Weibo – Scraping Images, Videos, and Comments


Scraping approach:

  1. Manually search for the target user, open their profile page, and locate the data API in the browser's dev tools
  2. Copy the request as cURL and use a curl-to-code tool to generate the request code automatically
  3. Run the request code in the editor to fetch the JSON
  4. Parse the JSON for the basics, such as the poster's id and the id of each post; push each post id onto a queue; pagination is just a loop over range(1, total_num)
  5. Use the parsed content (including image and video links) to download the images and videos
  6. For each post id in the queue, call the comment API to fetch one page (20 comments), build a new JSON record from the response, and also parse a field called max_id, which is passed as a parameter to request the next page; repeat until a post has no comments left (many of these requests are wasted and return no data) — see the sketch after this list

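A minimal sketch of step 6's max_id paging, distilled from the full script below (the helper name fetch_all_comments is ours for illustration; the stop condition assumes the endpoint returns max_id = 0 on the last page, matching the behavior described above):

import requests

def fetch_all_comments(article_id, uid, headers=None, cookies=None):
    """Page through one post's comments until max_id signals the last page."""
    url = "https://weibo.com/ajax/statuses/buildComments"
    max_id, comments = 0, []
    while True:
        params = {"id": article_id, "uid": uid, "max_id": max_id,
                  "count": "20", "flow": "0", "is_reload": "1",
                  "is_show_bulletin": "2", "is_mix": "0"}
        data = requests.get(url, params=params, headers=headers,
                            cookies=cookies).json()
        comments.extend(data.get("data", []))  # one page holds up to 20 comments
        max_id = data.get("max_id", 0)
        if max_id == 0:  # 0 (or a missing key) marks the last page
            return comments
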
No login is needed. The cookies were parsed out of the curl export, but requests succeed whether or not they are sent. I originally assumed login was required; testing the API showed that scraping works without it. The login wall is enforced only by the front end, and Weibo's back-end API does not apply that restriction.

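A quick way to confirm the no-login behavior (a sketch; it assumes the endpoint still serves anonymous traffic, as it did when this was tested, and uses the same uid as the script below):

import requests

# No cookies at all, only a browser-like User-Agent
resp = requests.get(
    "https://weibo.com/ajax/statuses/mymblog",
    params={"uid": 1646239802, "page": 1, "feature": 0},
    headers={"user-agent": "Mozilla/5.0"})
print(resp.status_code, len(resp.json()["data"]["list"]))  # posts come back without logging in
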
Drawbacks:
The code is not very rigorous: it spawns too many threads, the JSON handling is sloppy, and the comment crawl takes a long time, so a weak machine can easily crash the IDE. When I have time I will optimize it with a thread pool (a sketch follows the full script).

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
@des     : scrape a Weibo user's posts (images, videos) and their comments
"""
from Project.util import pathways_util  # project-internal helper; create_path(p) is assumed to create directory p if missing
import requests
import os
import json
import time
from threading import Thread
from tqdm import tqdm
import queue


def req_get_data(url, headers=None, cookies=None, params=None, retry=3):
    """
    GET request with simple retry.
    :param url: target URL
    :param headers: request headers
    :param cookies: request cookies
    :param params: query-string parameters
    :param retry: number of attempts before giving up
    :return: the Response on success, None if every attempt fails
    """
    while retry > 0:
        try:
            response = requests.get(
                url=url,
                headers=headers,
                cookies=cookies,
                params=params)
            time.sleep(1)  # crude rate limit between requests
            response.encoding = "utf-8"
            return response
        except Exception as e:
            retry -= 1
            if retry == 0:
                print(e)
    return None


def download_pic(lis):
    """
    Download a list of image URLs.
    :param lis: list of image URLs
    :return:
    """
    parent_path = R"D:\gitlab\baidu\Project\test\KS\pic"
    for url in lis:
        li = url.split("/")
        path1 = li[-2]      # second-to-last URL segment, used as a sub-folder
        pic_name = li[-1]   # last segment is the file name
        save_path = os.path.join(parent_path, path1)
        pathways_util.create_path(save_path)
        response = req_get_data(url)
        if response is None:  # all retries failed, skip this image
            continue
        pic_file = os.path.join(save_path, pic_name)
        with open(pic_file, "wb") as fp:
            fp.write(response.content)


def download_video(url):
    """
    Download a single video URL.
    :param url: video URL
    :return:
    """
    save_path = R"D:\gitlab\baidu\Project\test\KS\video"
    _li = url.split("?")[0]  # drop the query string
    li = _li.split("/")
    video_name = li[-1]
    pathways_util.create_path(save_path)
    response = req_get_data(url)
    if response is None:  # all retries failed, skip this video
        return
    video_file = os.path.join(save_path, video_name)
    with open(video_file, "wb") as fp:
        fp.write(response.content)


def download(lis):
    """
    Dispatch image/video downloads, one thread per item.
    :param lis: list of parsed post dicts
    :return:
    """
    for i in lis:
        if i["pic_lis"] != "not found":
            Thread(target=download_pic, args=(i["pic_lis"],)).start()
        if i["pic_largest"] != "not found":
            Thread(target=download_pic, args=(i["pic_largest"],)).start()
        if i["video_url"] != "not found":
            Thread(target=download_video, args=(i["video_url"],)).start()


def run(total_page, uid):
    """
    Crawl a user's posts page by page.
    :param total_page: number of pages to fetch
    :param uid: Weibo user id
    :return:
    """
    # Weibo post-list endpoint
    url = "https://weibo.com/ajax/statuses/mymblog"
    for num in tqdm(range(1, total_page + 1)):
        params = {
            "uid": uid,
            "page": "{}".format(num),
            "feature": "0"
        }
        response = req_get_data(url, headers, cookies, params)
        if response is None:  # all retries failed, skip this page
            continue
        json_response = response.json()

        lis = []
        # Parse the response: one dict per post
        for i in json_response["data"]["list"]:
            item = {}
            try:
                weibo_id = i["id"]
                # queue the post id so its comments can be crawled later
                id_queue.put(weibo_id)
            except Exception:
                weibo_id = "unknown"

            try:
                user_id = i["user"]["id"]
            except Exception:
                user_id = "unknown"
            try:
                created_time = i["created_at"]
            except Exception:
                created_time = "unknown"
            try:
                weibo_text = i["text_raw"]
            except Exception:
                weibo_text = "not found"

            try:
                title = i["page_info"]["content1"]
            except Exception:
                title = "not found"

            try:
                _title = i["page_info"]["content2"]
            except Exception:
                _title = "not found"

            try:
                video_url = i["page_info"]["media_info"]["stream_url_hd"]
            except Exception:
                video_url = "not found"

            try:
                pic_lis = [
                    "https://wx4.sinaimg.cn/orj360/{}.jpg".format(pid) for pid in i["pic_ids"]]
                if pic_lis == []:
                    pic_lis = "not found"
            except Exception:
                pic_lis = "not found"

            try:
                pic_largest = [info["largest"]["url"]
                               for info in i["retweeted_status"]["pic_infos"].values()]
                if pic_largest == []:
                    pic_largest = "not found"
            except Exception:
                pic_largest = "not found"

            item["id"] = weibo_id
            item["user_id"] = user_id
            item["created_time"] = created_time
            item["weibo_text"] = weibo_text
            item["title"] = title
            item["_title"] = _title
            item["video_url"] = video_url
            item["pic_largest"] = pic_largest
            item["pic_lis"] = pic_lis
            lis.append(item)

        save_file = os.path.join(save_json_path, str(num) + ".json")
        with open(save_file, "w", encoding="utf-8") as fp:
            json.dump(lis, fp, ensure_ascii=False)  # write real JSON, not a Python repr
        download(lis)


def run2(article_id, uid, _path):
    """
    Crawl all comments of one post, 20 per request, paging with max_id.
    :param article_id: id of the post whose comments are fetched
    :param uid: Weibo user id of the post author
    :param _path: directory the comment JSON files are written to
    :return:
    """
    url = "https://weibo.com/ajax/statuses/buildComments"
    file = os.path.join(_path, "{}.json".format(article_id))
    max_id = 0
    while True:
        params = {
            "flow": "0",
            "is_reload": "1",
            "id": "{}".format(article_id),
            "is_show_bulletin": "2",
            "is_mix": "0",
            "max_id": "{}".format(max_id),
            "count": "20",
            "uid": uid
        }
        try:
            response = req_get_data(url, headers, cookies, params)
            data = response.json()
        except Exception:
            print("bad response for post {}".format(article_id))
            return

        lis = []
        for i in data.get("data", []):
            try:
                lis.append({
                    "id": i["id"],
                    "text_raw": i["text_raw"],
                    "description": i["user"]["description"],
                    "floor_number": i["floor_number"],
                    "location": i["user"]["location"],
                    "user_id": i["user"]["id"],
                    "username": i["user"]["screen_name"],
                    "profile_image_url": i["user"]["profile_image_url"],
                    "avatar_hd": i["user"]["avatar_hd"],
                })
            except Exception:
                continue  # skip comments missing expected fields
        if lis:
            with open(file, "a", encoding="utf-8") as fp:
                fp.write(json.dumps(lis, ensure_ascii=False) + '\n')

        max_id = data.get("max_id", 0)
        if max_id == 0:  # 0 (or a missing key) marks the last page
            print("comments of post {} finished".format(article_id))
            return


if __name__ == '__main__':
    uid = 1646239802
    total_page = 29

    id_queue = queue.Queue()  # post ids produced by run(), consumed below
    headers = {
        "authority": "weibo.com",
        "accept": "application/json, text/plain, */*",
        "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "cache-control": "no-cache",
        "client-version": "v2.36.2",
        "dnt": "1",
        "pragma": "no-cache",
        "referer": "https://weibo.com/liyuchun",
        "sec-ch-ua": "\"Google Chrome\";v=\"105\", \"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"105\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "server-version": "v2022.10.09.1",
        # "traceparent": "00-9f3b2454c0810fc61a10ab18d3a3b855-057303b29348c864-00",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
        "x-xsrf-token": "YCUWnOZu2_lCAgOQ-03rUhnb"
    }
    cookies = {
        "SUB": "_2AkMUSRrgf8NxqwJRmP8dymrnZYt3zwDEieKiFes7JRMxHRl-yT9jql4PtRB6P8k0D2oZkvenD_TU_lupH_VyIu2HKt2W",
        "SUBP": "0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWBTlk4q_ADB2gk335lJAyd",
        "SINAGLOBAL": "9008534476428.42.1665208272507",
        "XSRF-TOKEN": "YCUWnOZu2_lCAgOQ-03rUhnb",
        "_s_tentry": "weibo.com",
        "Apache": "3530993333926.089.1665457228837",
        "ULV": "1665457228985:2:2:1:3530993333926.089.1665457228837:1665208272561",
        "WBPSESS": "5fStQf4aE0d6e7rh9d-P6kT2L24ujmwJnUOkWzQKG-MQU8L534-LA9HTDBuvw3r9XO9hYndcirt_F-6AFGpc8XLzuH3spEY8m-xEoVtr8Wh4pEuH7EV06_mDtpV4V9GqKTzoOa8POo0fTtP5kMxMa50keg7bDIDCdqYI7iyEPCY="
    }
    save_json_path = R"D:\gitlab\baidu\Project\test\KS\json"
    pathways_util.create_path(save_json_path)
    run(total_page, uid)

    _path = R"D:\gitlab\baidu\Project\test\KS\评论"
    pathways_util.create_path(_path)
    while not id_queue.empty():
        article_id = id_queue.get()
        run2(article_id, uid, _path)
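
As for the thread-pool optimization mentioned in the drawbacks, one possible shape (a sketch, not the author's implementation; the name download_pooled is ours) is to replace the per-item Thread calls in download() with a bounded concurrent.futures pool:

from concurrent.futures import ThreadPoolExecutor

def download_pooled(lis, max_workers=8):
    """Same dispatch as download(), capped at max_workers concurrent jobs."""
    # the with-block waits for all submitted downloads before returning
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for i in lis:
            if i["pic_lis"] != "not found":
                pool.submit(download_pic, i["pic_lis"])
            if i["pic_largest"] != "not found":
                pool.submit(download_pic, i["pic_largest"])
            if i["video_url"] != "not found":
                pool.submit(download_video, i["video_url"])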

