1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
| import requests from lxml import etree import pandas as pd from wordcloud import WordCloud import jieba import datetime
class BarrageSpider:
    """Crawl the danmaku ("barrage") comments of one Bilibili video.

    The spider resolves the video's ``cid`` from its BV id, downloads the
    danmaku XML (current pool or per-day history), turns every ``<d>``
    element into a dict and can render a word cloud from a previously
    saved CSV.

    Args:
        bv: BV id of the video.
        cookie: Cookie header value sent with the history requests. When
            omitted, the original placeholder text is kept; it has to be
            replaced by a real cookie before the history endpoints are used.
        timeout: Seconds passed to every ``requests.get`` call so a stalled
            connection cannot hang the crawl forever.
    """

    # Shared by both header sets (the trailing space is part of the original value).
    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66 '
    )

    # Output keys for the first eight comma-separated fields of a <d p="...">
    # attribute: appearance time, mode, font size, colour, send time, pool,
    # user id, row id.
    _FIELD_NAMES = ('出现时间', '弹幕模式', '字体大小', '颜色', '发送时间', '弹幕池', '用户ID', 'rowID')

    def __init__(self, bv, cookie=None, timeout=10):
        self.bv = bv
        self.timeout = timeout
        # Filled in by get_video_time(); used to name the CSV / PNG files.
        self.video_name = None
        self._cid = None
        self.barrage_url = 'https://comment.bilibili.com/{}.xml'
        self.date_url = 'https://api.bilibili.com/x/v2/dm/history?type=1&oid={}&date={}'
        self.index_url = 'https://api.bilibili.com/x/v2/dm/history/index?type=1&oid={}&month={}'
        self.bv_url = 'https://api.bilibili.com/x/player/pagelist?bvid=' + bv + '&jsonp=jsonp'
        self.video_url = 'https://www.bilibili.com/video/{}'.format(bv)
        self.comment = {
            'referer': 'https://www.bilibili.com/',
            'user-agent': self._USER_AGENT,
        }
        if cookie is None:
            # Placeholder, roughly: "your cookie - crawling very old videos
            # gets the IP banned, later responses are all empty".
            cookie = "你的cookie 爬很久远的视频 会被封ip 后面接收到的都是空结果"
        self.date_headers = {
            "referer": "https://www.bilibili.com/",
            "origin": "https://www.bilibili.com",
            "cookie": cookie,
            "user-agent": self._USER_AGENT,
        }

    def _get(self, url, headers):
        """Issue a GET request with the spider's timeout applied."""
        return requests.get(url=url, headers=headers, timeout=self.timeout)

    @classmethod
    def _parse_barrages(cls, payload, strip_spaces=False):
        """Convert a danmaku XML payload into a list of item dicts.

        Elements without a ``p`` attribute or with fewer fields than
        ``_FIELD_NAMES`` are skipped; an element without text yields an
        empty ``'内容'`` instead of raising ``IndexError``.
        """
        if not payload:
            return []
        document = etree.HTML(payload)
        if document is None:
            return []

        items = []
        for node in document.xpath('//d'):
            attrs = node.xpath('./@p')
            if not attrs:
                continue
            fields = attrs[0].split(',')
            if len(fields) < len(cls._FIELD_NAMES):
                continue
            texts = node.xpath('./text()')
            content = texts[0] if texts else ''
            if strip_spaces:
                content = content.replace(" ", "")
            item = dict(zip(cls._FIELD_NAMES, fields))
            item['内容'] = content
            items.append(item)
        return items

    @staticmethod
    def _month_labels(start_year, start_month, end_year, end_month):
        """Return every ``'YYYY-MM'`` label from the start month to the end month, inclusive.

        An end month earlier than the start month gives an empty list.
        """
        first = start_year * 12 + (start_month - 1)
        last = end_year * 12 + (end_month - 1)
        return ['{}-{:02d}'.format(index // 12, index % 12 + 1) for index in range(first, last + 1)]

    def get_cid(self):
        """Return the ``cid`` of the video's first page.

        The value is fetched from ``bv_url`` once and then reused, because
        ``parse_date_url`` asks for it on every month.
        """
        if self._cid is None:
            self._cid = self._get(self.bv_url, self.comment).json()['data'][0]['cid']
        return self._cid

    def get_video_time(self):
        """Scrape the video page and return the text before the first space of the date element.

        Side effect: stores the page title in ``self.video_name``.

        Raises:
            IndexError: if the expected elements are not found in the page.
        """
        time_data = self._get(self.video_url, self.comment).text
        video_page = etree.HTML(time_data)
        v_time = video_page.xpath('//div[@class="video-data"]/span[3]/text()')[0].split(' ')[0]
        self.video_name = video_page.xpath('//h1[@class="video-title"]/span/text()')[0]
        return v_time

    def parse_url(self):
        """Download the video's danmaku XML from ``barrage_url``, print every item and return them."""
        cid = self.get_cid()
        response = self._get(self.barrage_url.format(cid), self.comment).content
        items = self._parse_barrages(response)
        for item in items:
            print(item)
        return items

    def parse_date_url(self, month):
        """Collect the history danmaku of every day listed for ``month``.

        Args:
            month: Month label in ``'YYYY-MM'`` form, as produced by ``parse_month``.

        Returns:
            A list of item dicts (spaces removed from ``'内容'``); empty when
            the index request reports no days for that month.
        """
        print('正在爬取{}月份的数据'.format(month))
        result = []
        oid = self.get_cid()
        date_by_month = self._get(self.index_url.format(oid, month), self.date_headers).json().get('data')
        if not date_by_month:
            return result
        for day in date_by_month:
            print('{}月份数据下的{}'.format(month, day))
            date_page = self._get(self.date_url.format(oid, day), self.date_headers).content
            result.extend(self._parse_barrages(date_page, strip_spaces=True))
        return result

    def parse_month(self):
        """Return the ``'YYYY-MM'`` labels from the video's date up to the current month."""
        start_day = datetime.datetime.strptime(self.get_video_time(), '%Y-%m-%d')
        end_day = datetime.date.today()
        return self._month_labels(start_day.year, start_day.month, end_day.year, end_day.month)

    def wordCloud(self, font_path="C:/Windows/Fonts/simfang.ttf"):
        """Render a word cloud PNG from the saved ``'{video_name}弹幕池数据集.csv'``.

        Args:
            font_path: Font file handed to ``WordCloud``; the default is the
                original hard-coded Windows path.
        """
        contents = pd.read_csv('{}弹幕池数据集.csv'.format(self.video_name))['内容']
        # Join the comments themselves rather than the repr of the list, so
        # brackets, quotes and "nan" never reach the tokenizer.
        text = " ".join(contents.dropna().astype(str))
        words = [token for token in jieba.cut(text) if len(token) > 1]
        cloud = WordCloud(font_path=font_path, background_color='white', scale=16)
        cloud.generate(" ".join(words)).to_file("{}词云.png".format(self.video_name))
def main():
    """Crawl every month of history danmaku for one video, save a CSV and draw a word cloud."""
    bv_id = input('输入视频对应的bv号:')
    spider = BarrageSpider(bv_id)

    # parse_month() also records spider.video_name, which names the output files.
    word_data = []
    for month in spider.parse_month():
        word_data.extend(spider.parse_date_url(month))

    if not word_data:
        # Nothing was returned (e.g. missing cookie or blocked IP): there is no
        # 'rowID' column to de-duplicate on and nothing to plot.
        print('未获取到任何弹幕数据')
        return

    # drop_duplicates returns a new frame; keep it so the CSV is really de-duplicated.
    data = pd.DataFrame(word_data).drop_duplicates(subset=['rowID'], keep='first')
    data.to_csv('{}弹幕池数据集.csv'.format(spider.video_name), index=False, encoding='utf-8-sig')
    spider.wordCloud()


if __name__ == '__main__':
    main()
|