# Collect chapter links
def get_urls(self):
    """Fetch the book's index page and queue every chapter URL.

    Requests ``self.url`` with ``self.headers`` through an entry picked at
    random from ``proxy_pool`` (defined outside this method; presumably a
    sequence of requests-style proxy mappings -- confirm against its
    definition). The text of the ``<h1>`` inside ``div#info`` is stored in
    ``self.bookName`` and one absolute URL per ``<li>`` of ``div.book_list``
    is appended to ``self.url_pool``.

    Raises:
        IndexError: if the index page has no ``div#info/h1`` title.
        requests.exceptions.RequestException: on network failure, including
            a timeout when the server does not answer in time.
    """
    # Local import keeps this method self-contained regardless of what the
    # file header already imports.
    from urllib.parse import urljoin

    response = requests.get(
        url=self.url,
        headers=self.headers,
        proxies=random.choice(proxy_pool),
        # Without a timeout a dead proxy would block the crawl forever.
        timeout=10,
    )
    page = etree.HTML(response.content.decode())
    self.bookName = page.xpath('//div[@id="info"]/h1/text()')[0]

    for item in page.xpath('//div[@class="book_list"]/ul/li'):
        hrefs = item.xpath('./a/@href')
        if not hrefs:
            # List entries without a link carry no chapter; skip them.
            continue
        # Resolve against the page that was actually fetched instead of a
        # hard-coded book path, so the method works for any ``self.url``.
        self.url_pool.append(urljoin(response.url, hrefs[0]))
    print('章节链接爬取完毕...')
# Parse and store chapter text
def get_text(self):
    """Download every queued chapter and append it to the book's text file.

    For each URL in ``self.url_pool`` the page is requested with
    ``self.headers`` through an entry picked at random from ``proxy_pool``
    (defined outside this method). The chapter title is read from
    ``div.h1title/h1`` and the body from the direct text nodes of
    ``div#htmlContent``. Newlines and non-breaking spaces are removed from
    each text node, empty nodes are dropped, and the remaining paragraphs
    are written, one per line and prefixed with a space, to
    ``./<self.bookName>.txt`` in append mode.

    Pages that lack the title or content element are reported and skipped
    so that a single malformed chapter does not abort the whole download.

    Raises:
        requests.exceptions.RequestException: on network failure, including
            a timeout when the server does not answer in time.
    """
    for url in self.url_pool:
        content_page = requests.get(
            url=url,
            headers=self.headers,
            proxies=random.choice(proxy_pool),
            # Without a timeout a dead proxy would block the crawl forever.
            timeout=10,
        )
        page = etree.HTML(content_page.content.decode())
        bodies = page.xpath('//div[@id="htmlContent"]')
        titles = page.xpath('//div[@class="h1title"]/h1/text()')
        if not bodies or not titles:
            print('跳过无法解析的章节: {}'.format(url))
            continue

        title = titles[0]
        print('正在爬取{}'.format(title))

        # Strip layout characters first, then keep only non-empty paragraphs.
        cleaned = (
            node.replace("\n", "").replace("\xa0", "")
            for node in bodies[0].xpath('./text()')
        )
        data = [' ' + text for text in cleaned if text]
        txt = '\n'.join(data)

        # Append per chapter so already-downloaded text survives a later
        # failure.
        with open('./{}.txt'.format(self.bookName), 'a', encoding='utf-8') as w:
            w.write(title + '\n' + txt + '\n\n')