1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
def get_cid(self):
    """Return the ``cid`` of the first entry in the ``data`` list served at ``self.bv_url``.

    The URL is requested with ``self.comment`` as the HTTP headers and the
    response body is decoded as JSON.
    """
    response = requests.get(url=self.bv_url, headers=self.comment)
    payload = response.json()
    first_entry = payload['data'][0]
    return first_entry['cid']
def parse_url(self):
    """Download the barrage document for the current cid and print one record per ``<d>`` element.

    The cid comes from ``self.get_cid()``; it is substituted into
    ``self.barrage_url`` and fetched with ``self.comment`` as headers.
    Each ``<d>`` element's comma-separated ``p`` attribute supplies the
    first eight fields of the record, and its text supplies ``'内容'``.

    Returns:
        None. Every record is written to stdout with ``print``.

    Raises:
        IndexError: if a ``<d>`` element has no ``p`` attribute or the
            attribute holds fewer than eight comma-separated values.
    """
    # Keys for the first eight positional values of the ``p`` attribute.
    field_names = ('出现时间', '弹幕模式', '字体大小', '颜色',
                   '发送时间', '弹幕池', '用户ID', 'rowID')

    cid = self.get_cid()
    raw = requests.get(url=self.barrage_url.format(cid), headers=self.comment).content
    tree = etree.HTML(raw)

    for node in tree.xpath('//d'):
        attrs = node.xpath('./@p')[0].split(',')
        # A <d> element with no text yields an empty xpath result; fall back
        # to '' so one blank entry does not abort the whole dump.
        text_nodes = node.xpath('./text()')
        content = text_nodes[0] if text_nodes else ''

        # Index explicitly (rather than zip) so a short attribute still raises.
        item = {name: attrs[i] for i, name in enumerate(field_names)}
        item['内容'] = content
        print(item)
def parse_date_url(self, month):
    """Collect barrage records for every day listed in the index for ``month``.

    The id from ``self.get_cid()`` and ``month`` are substituted into
    ``self.index_url``; the JSON ``data`` value of that response is iterated
    as a sequence of days. For each day, ``self.date_url`` is fetched and
    every ``<d>`` element is turned into a record. All requests use
    ``self.date_headers``.

    Args:
        month: Value substituted into ``self.index_url`` together with the id.

    Returns:
        A list of dicts, one per ``<d>`` element, in fetch order. The first
        eight fields come from the comma-separated ``p`` attribute; ``'内容'``
        is the element text with ASCII spaces removed. Empty when the index
        reports no days.

    Raises:
        IndexError: if a ``<d>`` element has no ``p`` attribute or the
            attribute holds fewer than eight comma-separated values.
    """
    # Keys for the first eight positional values of the ``p`` attribute.
    field_names = ('出现时间', '弹幕模式', '字体大小', '颜色',
                   '发送时间', '弹幕池', '用户ID', 'rowID')

    result = []
    oid = self.get_cid()
    days = requests.get(url=self.index_url.format(oid, month),
                        headers=self.date_headers).json()['data']
    # NOTE(review): presumably the index can answer with a JSON null when a
    # month has no entries - confirm against the API. Iterating None would
    # raise TypeError, so treat it as "no days".
    if days is None:
        return result

    for day in days:
        page = requests.get(url=self.date_url.format(oid, day),
                            headers=self.date_headers).content
        tree = etree.HTML(page)
        for node in tree.xpath('//d'):
            attrs = node.xpath('./@p')[0].split(',')
            # A <d> element with no text yields an empty xpath result; use ''
            # instead of failing on [0].
            text_nodes = node.xpath('./text()')
            content = text_nodes[0].replace(" ", "") if text_nodes else ''

            # Index explicitly (rather than zip) so a short attribute still raises.
            item = {name: attrs[i] for i, name in enumerate(field_names)}
            item['内容'] = content
            result.append(item)
    return result
def wordCloud(self):
    """Render a word-cloud PNG from the ``'内容'`` column of this video's CSV data set.

    Reads ``'<bv>弹幕池数据集.csv'`` (``<bv>`` being ``self.bv``), segments the
    text with ``jieba.cut``, keeps tokens longer than one character, and
    writes the image to ``'<bv>词云.png'``. The font path is hard-coded to a
    Windows location.
    """
    cloud = WordCloud(font_path="C:/Windows/Fonts/simfang.ttf",
                      background_color='white',
                      scale=16)

    frame = pd.read_csv('{}弹幕池数据集.csv'.format(self.bv))
    comments = frame['内容'].tolist()
    # The column is segmented via the list's string representation, so the
    # list punctuation is part of the text handed to jieba.
    corpus = str(comments)

    tokens = []
    for token in jieba.cut(corpus):
        if len(token) > 1:
            tokens.append(token)

    cloud.generate(" ".join(tokens)).to_file("{}词云.png".format(self.bv))
|