




1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
| import requests from urllib.parse import urlencode import random
# Request headers sent with every page request.
# NOTE(review): "Host" and "Referer" are blank in this listing; presumably they
# must be filled in with the target site's values before use — TODO confirm.
headers = {
    "Host": "",
    "Referer": "",
    "user-agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, "
                  "like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36 Edg/87.0.664.66 ",
    # Fixed typo: the original value was "XMXLHttpRequest".
    "X-Requested-With": "XMLHttpRequest",
}

# Endpoint prefix; the url-encoded query string is appended directly to it.
# NOTE(review): blank in this listing — presumably ends with "?"; TODO confirm.
base_url = ''

# Pool of proxy mappings; one is picked at random for each request.
# Requests looks proxies up by the *lowercase* URL scheme, so the key must be
# 'http' — an upper-case 'HTTP' key is never matched and the proxy is ignored.
# NOTE(review): the proxy addresses are blank in this listing. If base_url is
# an https:// URL, each entry also needs an 'https' key — TODO confirm.
proxy_pool = [
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
    {'http': ''},
]

# Module-level scratch dict for the most recently parsed post.
weibo = {}
接下来我们进行函数的编写：封装两个函数，分别用来请求数据和解析数据。

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
def get_page(since_id=None, timeout=10):
    """Request one page of results and return it with the next-page cursor.

    Args:
        since_id: Cursor taken from the previous page's response. ``None``
            requests the first page; the parameter is then left out of the
            query string entirely.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(payload, next_since_id)`` tuple on success, where ``payload`` is
        the decoded JSON body and ``next_since_id`` is the value found at
        ``data.cardlistInfo.since_id`` (``None`` when absent). Returns
        ``None`` when the request fails, the status code is not 200, or the
        body is not valid JSON.
    """
    params = {
        'uid': '2946150412',
        'luicode': '10000011',
        'lfid': '231093_-_selffollowed',
        'type': 'uid',
        'value': '2946150412',
        'containerid': '1076032946150412',
    }
    # urlencode() would serialise None as the literal text "None", so the
    # cursor is only sent when the caller actually has one.
    if since_id is not None:
        params['since_id'] = since_id
    url = base_url + urlencode(params)

    try:
        response = requests.get(
            url=url,
            headers=headers,
            proxies=random.choice(proxy_pool),
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Covers connection errors, timeouts and other transport failures.
        print("错误:", e.args)
        return None

    if response.status_code != 200:
        return None

    try:
        payload = response.json()
    except ValueError as e:
        # Body was not valid JSON (e.g. an HTML error or block page).
        print("错误:", e.args)
        return None

    # Tolerate responses where "data" or "cardlistInfo" is missing or null.
    data = payload.get('data') or {}
    card_info = data.get('cardlistInfo') or {}
    return payload, card_info.get('since_id')
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
def parse_page(json):
    """Yield one summary dict per post found in a page payload.

    Only cards carrying a non-empty ``mblog`` entry are reported. Each yielded
    dict is a fresh object holding ``source``, ``created_at``, ``raw_text``
    and ``original_pic`` (``None`` when the post has none). A ``pics`` key,
    the picture URLs joined with ``' , '``, is present only when the post has
    pictures.

    A new dict is built per post rather than reusing one shared module-level
    dict; with a shared dict, a post without pictures would inherit the
    previous post's ``pics`` value, and every yielded item would be the same
    object.

    Args:
        json: Decoded page payload (a dict); cards are read from
            ``json['data']['cards']``. A missing or null ``data`` / ``cards``
            is treated as an empty page.

    Yields:
        dict: Summary of one post.

    Raises:
        KeyError: If an ``mblog`` lacks ``source``, ``created_at`` or
            ``raw_text``, or a picture entry lacks ``url``.
    """
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            continue

        post = {
            'source': mblog['source'],
            'created_at': mblog['created_at'],
            'raw_text': mblog['raw_text'],
            'original_pic': mblog.get('original_pic'),
        }
        pictures = mblog.get('pics')
        if pictures:
            post['pics'] = ' , '.join(pic['url'] for pic in pictures)
        yield post
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
def _format_post(res):
    """Render one parsed post as the text record appended to the output file."""
    img = res.get('original_pic')
    pics = res.get('pics')
    if img is None:
        img = "无"
    if pics is None:
        pics = "无"
    return ''.join([
        '时间:', res['created_at'], '\n',
        '来源:', res['source'], '\n',
        '内容:', res['raw_text'], '\n',
        '附图链接地址:', img, '\n',
        '图床:', pics, '\n',
        '\n',
    ])


def domain(max_pages=200, output_path='test.txt'):
    """Crawl successive pages and append every post to a text file.

    Pages are fetched with ``get_page``, starting with no cursor and then
    passing along the cursor each page returns. Crawling stops after
    ``max_pages`` pages, when a page cannot be fetched (``get_page`` returned
    ``None``), or when a page reports no cursor for a following page.
    Requesting again without a cursor would fetch the first page again and
    write duplicate records.

    Args:
        max_pages: Upper bound on the number of pages to request.
        output_path: File the records are appended to (UTF-8).
    """
    # Kept as a module-level name for backward compatibility: after a run it
    # holds the result of the last get_page() call.
    global return_data

    since_id = None
    # Open the output once for the whole run instead of once per record.
    with open(output_path, 'a', encoding='utf8') as file:
        for page_no in range(1, max_pages + 1):
            print("正在爬取第{}页....".format(page_no))
            return_data = get_page(since_id)
            if not return_data:
                # Fetch failed; there is no payload and no cursor to go on.
                break
            payload, since_id = return_data
            for res in parse_page(payload):
                file.write(_format_post(res))
            if not since_id:
                # No cursor for a further page: this was the last one.
                break

