Weibo – Scraping Images, Videos, and Comments


Scraping approach:

  1. Manually search for the target user, open their profile page, and locate the data API in the browser's dev tools
  2. Copy the request as cURL and use a curl-to-code tool to generate the request code automatically
  3. Run the request code in the editor to fetch the JSON
  4. Parse the JSON for the basics, such as the poster's id and the id of each post; push each post id onto a queue; pagination is just a loop over range(1, total_num)
  5. Use the parsed content (including image and video links) to download the images and videos
  6. For each post id in the queue, call the comment API to fetch one page (20 comments), build a new JSON record from the response, and also parse a field called max_id, which is passed as a parameter to request the next page; repeat until a post has no comments left (many of these requests are wasted and return no data) — see the sketch after this list

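A minimal sketch of step 6's max_id paging, distilled from the full script below (the helper name fetch_all_comments is ours for illustration; the stop condition assumes the endpoint returns max_id = 0 on the last page, matching the behavior described above):

import requests

def fetch_all_comments(article_id, uid, headers=None, cookies=None):
    """Page through one post's comments until max_id signals the last page."""
    url = "https://weibo.com/ajax/statuses/buildComments"
    max_id, comments = 0, []
    while True:
        params = {"id": article_id, "uid": uid, "max_id": max_id,
                  "count": "20", "flow": "0", "is_reload": "1",
                  "is_show_bulletin": "2", "is_mix": "0"}
        data = requests.get(url, params=params, headers=headers,
                            cookies=cookies).json()
        comments.extend(data.get("data", []))  # one page holds up to 20 comments
        max_id = data.get("max_id", 0)
        if max_id == 0:  # 0 (or a missing key) marks the last page
            return comments
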
No login is needed. The cookies were parsed out of the curl export, but requests succeed whether or not they are sent. I originally assumed login was required; testing the API showed that scraping works without it. The login wall is enforced only by the front end, and Weibo's back-end API does not apply that restriction.

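A quick way to confirm the no-login behavior (a sketch; it assumes the endpoint still serves anonymous traffic, as it did when this was tested, and uses the same uid as the script below):

import requests

# No cookies at all, only a browser-like User-Agent
resp = requests.get(
    "https://weibo.com/ajax/statuses/mymblog",
    params={"uid": 1646239802, "page": 1, "feature": 0},
    headers={"user-agent": "Mozilla/5.0"})
print(resp.status_code, len(resp.json()["data"]["list"]))  # posts come back without logging in
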
Drawbacks:
The code is not very rigorous: it spawns too many threads, the JSON handling is sloppy, and the comment crawl takes a long time, so a weak machine can easily crash the IDE. When I have time I will optimize it with a thread pool (a sketch follows the full script).

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
@des     : scrape a Weibo user's posts (images, videos) and their comments
"""
from Project.util import pathways_util  # project-internal helper; create_path(p) is assumed to create directory p if missing
import requests
import os
import json
import time
from threading import Thread
from tqdm import tqdm
import queue


def req_get_data(url, headers=None, cookies=None, params=None, retry=3):
    """
    GET request with simple retry.
    :param url: target URL
    :param headers: request headers
    :param cookies: request cookies
    :param params: query-string parameters
    :param retry: number of attempts before giving up
    :return: the Response on success, None if every attempt fails
    """
    while retry > 0:
        try:
            response = requests.get(
                url=url,
                headers=headers,
                cookies=cookies,
                params=params)
            time.sleep(1)  # crude rate limit between requests
            response.encoding = "utf-8"
            return response
        except Exception as e:
            retry -= 1
            if retry == 0:
                print(e)
    return None


def download_pic(lis):
    """
    Download a list of image URLs.
    :param lis: list of image URLs
    :return:
    """
    parent_path = R"D:\gitlab\baidu\Project\test\KS\pic"
    for url in lis:
        li = url.split("/")
        path1 = li[-2]      # second-to-last URL segment, used as a sub-folder
        pic_name = li[-1]   # last segment is the file name
        save_path = os.path.join(parent_path, path1)
        pathways_util.create_path(save_path)
        response = req_get_data(url)
        if response is None:  # all retries failed, skip this image
            continue
        pic_file = os.path.join(save_path, pic_name)
        with open(pic_file, "wb") as fp:
            fp.write(response.content)


def download_video(url):
    """
    Download a single video URL.
    :param url: video URL
    :return:
    """
    save_path = R"D:\gitlab\baidu\Project\test\KS\video"
    _li = url.split("?")[0]  # drop the query string
    li = _li.split("/")
    video_name = li[-1]
    pathways_util.create_path(save_path)
    response = req_get_data(url)
    if response is None:  # all retries failed, skip this video
        return
    video_file = os.path.join(save_path, video_name)
    with open(video_file, "wb") as fp:
        fp.write(response.content)


def download(lis):
    """
    Dispatch image/video downloads, one thread per item.
    :param lis: list of parsed post dicts
    :return:
    """
    for i in lis:
        if i["pic_lis"] != "not found":
            Thread(target=download_pic, args=(i["pic_lis"],)).start()
        if i["pic_largest"] != "not found":
            Thread(target=download_pic, args=(i["pic_largest"],)).start()
        if i["video_url"] != "not found":
            Thread(target=download_video, args=(i["video_url"],)).start()


def run(total_page, uid):
    """
    Crawl a user's posts page by page.
    :param total_page: number of pages to fetch
    :param uid: Weibo user id
    :return:
    """
    # Weibo post-list endpoint
    url = "https://weibo.com/ajax/statuses/mymblog"
    for num in tqdm(range(1, total_page + 1)):
        params = {
            "uid": uid,
            "page": "{}".format(num),
            "feature": "0"
        }
        response = req_get_data(url, headers, cookies, params)
        if response is None:  # all retries failed, skip this page
            continue
        json_response = response.json()

        lis = []
        # Parse the response: one dict per post
        for i in json_response["data"]["list"]:
            item = {}
            try:
                weibo_id = i["id"]
                # queue the post id so its comments can be crawled later
                id_queue.put(weibo_id)
            except Exception:
                weibo_id = "unknown"

            try:
                user_id = i["user"]["id"]
            except Exception:
                user_id = "unknown"
            try:
                created_time = i["created_at"]
            except Exception:
                created_time = "unknown"
            try:
                weibo_text = i["text_raw"]
            except Exception:
                weibo_text = "not found"

            try:
                title = i["page_info"]["content1"]
            except Exception:
                title = "not found"

            try:
                _title = i["page_info"]["content2"]
            except Exception:
                _title = "not found"

            try:
                video_url = i["page_info"]["media_info"]["stream_url_hd"]
            except Exception:
                video_url = "not found"

            try:
                pic_lis = [
                    "https://wx4.sinaimg.cn/orj360/{}.jpg".format(pid) for pid in i["pic_ids"]]
                if pic_lis == []:
                    pic_lis = "not found"
            except Exception:
                pic_lis = "not found"

            try:
                pic_largest = [info["largest"]["url"]
                               for info in i["retweeted_status"]["pic_infos"].values()]
                if pic_largest == []:
                    pic_largest = "not found"
            except Exception:
                pic_largest = "not found"

            item["id"] = weibo_id
            item["user_id"] = user_id
            item["created_time"] = created_time
            item["weibo_text"] = weibo_text
            item["title"] = title
            item["_title"] = _title
            item["video_url"] = video_url
            item["pic_largest"] = pic_largest
            item["pic_lis"] = pic_lis
            lis.append(item)

        save_file = os.path.join(save_json_path, str(num) + ".json")
        with open(save_file, "w", encoding="utf-8") as fp:
            json.dump(lis, fp, ensure_ascii=False)  # write real JSON, not a Python repr
        download(lis)


def run2(article_id, uid, _path):
    """
    Crawl all comments of one post, 20 per request, paging with max_id.
    :param article_id: id of the post whose comments are fetched
    :param uid: Weibo user id of the post author
    :param _path: directory the comment JSON files are written to
    :return:
    """
    url = "https://weibo.com/ajax/statuses/buildComments"
    file = os.path.join(_path, "{}.json".format(article_id))
    max_id = 0
    while True:
        params = {
            "flow": "0",
            "is_reload": "1",
            "id": "{}".format(article_id),
            "is_show_bulletin": "2",
            "is_mix": "0",
            "max_id": "{}".format(max_id),
            "count": "20",
            "uid": uid
        }
        try:
            response = req_get_data(url, headers, cookies, params)
            data = response.json()
        except Exception:
            print("bad response for post {}".format(article_id))
            return

        lis = []
        for i in data.get("data", []):
            try:
                lis.append({
                    "id": i["id"],
                    "text_raw": i["text_raw"],
                    "description": i["user"]["description"],
                    "floor_number": i["floor_number"],
                    "location": i["user"]["location"],
                    "user_id": i["user"]["id"],
                    "username": i["user"]["screen_name"],
                    "profile_image_url": i["user"]["profile_image_url"],
                    "avatar_hd": i["user"]["avatar_hd"],
                })
            except Exception:
                continue  # skip comments missing expected fields
        if lis:
            with open(file, "a", encoding="utf-8") as fp:
                fp.write(json.dumps(lis, ensure_ascii=False) + '\n')

        max_id = data.get("max_id", 0)
        if max_id == 0:  # 0 (or a missing key) marks the last page
            print("comments of post {} finished".format(article_id))
            return


if __name__ == '__main__':
    uid = 1646239802
    total_page = 29

    id_queue = queue.Queue()  # post ids produced by run(), consumed below
    headers = {
        "authority": "weibo.com",
        "accept": "application/json, text/plain, */*",
        "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "cache-control": "no-cache",
        "client-version": "v2.36.2",
        "dnt": "1",
        "pragma": "no-cache",
        "referer": "https://weibo.com/liyuchun",
        "sec-ch-ua": "\"Google Chrome\";v=\"105\", \"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"105\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "server-version": "v2022.10.09.1",
        # "traceparent": "00-9f3b2454c0810fc61a10ab18d3a3b855-057303b29348c864-00",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
        "x-xsrf-token": "YCUWnOZu2_lCAgOQ-03rUhnb"
    }
    cookies = {
        "SUB": "_2AkMUSRrgf8NxqwJRmP8dymrnZYt3zwDEieKiFes7JRMxHRl-yT9jql4PtRB6P8k0D2oZkvenD_TU_lupH_VyIu2HKt2W",
        "SUBP": "0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWBTlk4q_ADB2gk335lJAyd",
        "SINAGLOBAL": "9008534476428.42.1665208272507",
        "XSRF-TOKEN": "YCUWnOZu2_lCAgOQ-03rUhnb",
        "_s_tentry": "weibo.com",
        "Apache": "3530993333926.089.1665457228837",
        "ULV": "1665457228985:2:2:1:3530993333926.089.1665457228837:1665208272561",
        "WBPSESS": "5fStQf4aE0d6e7rh9d-P6kT2L24ujmwJnUOkWzQKG-MQU8L534-LA9HTDBuvw3r9XO9hYndcirt_F-6AFGpc8XLzuH3spEY8m-xEoVtr8Wh4pEuH7EV06_mDtpV4V9GqKTzoOa8POo0fTtP5kMxMa50keg7bDIDCdqYI7iyEPCY="
    }
    save_json_path = R"D:\gitlab\baidu\Project\test\KS\json"
    pathways_util.create_path(save_json_path)
    run(total_page, uid)

    _path = R"D:\gitlab\baidu\Project\test\KS\评论"
    pathways_util.create_path(_path)
    while not id_queue.empty():
        article_id = id_queue.get()
        run2(article_id, uid, _path)
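
As for the thread-pool optimization mentioned in the drawbacks, one possible shape (a sketch, not the author's implementation; the name download_pooled is ours) is to replace the per-item Thread calls in download() with a bounded concurrent.futures pool:

from concurrent.futures import ThreadPoolExecutor

def download_pooled(lis, max_workers=8):
    """Same dispatch as download(), capped at max_workers concurrent jobs."""
    # the with-block waits for all submitted downloads before returning
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for i in lis:
            if i["pic_lis"] != "not found":
                pool.submit(download_pic, i["pic_lis"])
            if i["pic_largest"] != "not found":
                pool.submit(download_pic, i["pic_largest"])
            if i["video_url"] != "not found":
                pool.submit(download_video, i["video_url"])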

