Twitter 热门搜索结果文本抓取

得意时要看淡,失意时要看开。不论得意失意,切莫大意;不论成功失败,切莫止步。志得意满时,需要的是淡然,给自己留一条退路;失意落魄时,需要的是泰然,给自己觅一条出路。本文介绍 Twitter 热门搜索结果文本抓取,希望对大家有帮助,欢迎收藏,转发!站点地址:www.bmabk.com,来源:原文

"""
twitter 热门频道搜索,数据动态加载,前两页都是20条,往下是新接口,一次刷新2条新数据

推荐个工具,复制curl就能快速生成requests请求样例代码,自动解析header,cookie参数,非常实用
https://spidertools.cn/#/formatDict
"""


import requests
import time
from threading import Thread
from queue import Queue

def get_data():
    """Fetch the first search-result page for the hard-coded query.

    Sends a single GET request to the ``adaptive.json`` search endpoint
    using fixed headers, cookies and query parameters (no cursor is sent).

    Returns:
        tuple: ``(cursor, tweet_ids)``. ``cursor`` is the value found at
        ``timeline.instructions[0].addEntries.entries[21]`` in the JSON
        response; ``tweet_ids`` is the list of keys of
        ``globalObjects.tweets``.

    Raises:
        KeyError, IndexError: if the response JSON does not have the
            layout described above (for example fewer than 22 entries).
        requests.exceptions.RequestException: if the request fails or
            exceeds the timeout.
    """
    # NOTE(review): the bearer token, CSRF token, guest token and cookies
    # below are hard-coded; presumably they were captured from one browser
    # session and will stop working once they expire -- confirm.
    headers = {
        "authority": "twitter.com",
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
        "cache-control": "no-cache",
        "dnt": "1",
        "pragma": "no-cache",
        "referer": "https://twitter.com/search?q=%22submit%20office%22&src=typed_query&f=top",
        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
        "x-csrf-token": "3cc063510474507c7e1ae35576e53194",
        "x-guest-token": "1592748747951595520",
        "x-twitter-active-user": "yes",
        "x-twitter-client-language": "zh-cn"
    }
    cookies = {
        "guest_id_marketing": "v1%3A166841527748318144",
        "guest_id_ads": "v1%3A166841527748318144",
        "personalization_id": "\"v1_SIcw4PySge1De0L5TotiuQ==\"",
        "guest_id": "v1%3A166841527748318144",
        "external_referer": "padhuUp37zhsl55Izsa7f%2F2wsNVnW83D|0|8e8t2xd8A2w%3D",
        "_gid": "GA1.2.360851272.1668415299",
        "ct0": "3cc063510474507c7e1ae35576e53194",
        "g_state": "{\"i_p\":1669170412369,\"i_l\":3}",
        "at_check": "true",
        "mbox": "session#f303cf2ab1ad427e93eef340c6d1e79d#1668567608|PC#f303cf2ab1ad427e93eef340c6d1e79d.32_0#1731810548",
        "_ga_BYKEBDM7DS": "GS1.1.1668565653.1.1.1668565749.0.0.0",
        "_ga": "GA1.2.1925896343.1668415299",
        "gt": "1592748747951595520"
    }
    url = "https://twitter.com/i/api/2/search/adaptive.json"
    params = {
        "include_profile_interstitial_type": "1",
        "include_blocking": "1",
        "include_blocked_by": "1",
        "include_followed_by": "1",
        "include_want_retweets": "1",
        "include_mute_edge": "1",
        "include_can_dm": "1",
        "include_can_media_tag": "1",
        "include_ext_has_nft_avatar": "1",
        "include_ext_is_blue_verified": "1",
        "skip_status": "1",
        "cards_platform": "Web-12",
        "include_cards": "1",
        "include_ext_alt_text": "true",
        "include_ext_limited_action_results": "false",
        "include_quote_count": "true",
        "include_reply_count": "1",
        "tweet_mode": "extended",
        "include_ext_collab_control": "true",
        "include_entities": "true",
        "include_user_entities": "true",
        "include_ext_media_color": "true",
        "include_ext_media_availability": "true",
        "include_ext_sensitive_media_warning": "true",
        "include_ext_trusted_friends_metadata": "true",
        "send_error_codes": "true",
        "simple_quoted_tweet": "true",
        "q": "\"submit office\"",
        "count": "20",
        "query_source": "typed_query",
        "pc": "1",
        "spelling_corrections": "1",
        "include_ext_edit_control": "true",
        "ext": "mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo,editControl,collab_control,vibe"
    }
    # Without a timeout a stalled connection would block the caller forever.
    timeout_seconds = 30
    response = requests.get(
        url,
        headers=headers,
        cookies=cookies,
        params=params,
        timeout=timeout_seconds,
    )

    # Decode the body once and reuse it for both return values.
    payload = response.json()

    entries = payload["timeline"]["instructions"][0]["addEntries"]["entries"]
    # NOTE(review): index 21 is presumably the bottom-cursor entry that
    # follows the 20 requested results -- confirm against a live response.
    cursor = entries[21]["content"]["operation"]["cursor"]["value"]
    tweet_ids = list(payload["globalObjects"]["tweets"].keys())
    return cursor, tweet_ids



def get_detail(tw_id_lis):
    """Fetch each tweet's detail page and collect its full text.

    For every id in ``tw_id_lis`` one GET request is sent to the
    ``TweetDetail`` GraphQL endpoint. The stripped ``full_text`` of the
    first entry of the first instruction is printed and put on the
    module-level queue ``que``.

    Args:
        tw_id_lis: iterable of tweet ids. Each id is converted with
            ``str()`` before being placed in the request.

    Returns:
        None. Results are delivered through ``print`` and ``que``.

    Raises:
        requests.exceptions.RequestException: if a request fails or
            exceeds the timeout. Errors while decoding or traversing a
            response are printed and that tweet is skipped.
    """
    # Local import: only this function serialises JSON request text.
    import json

    # NOTE(review): the bearer token, CSRF token, guest token and cookies
    # below are hard-coded; presumably they were captured from one browser
    # session and will stop working once they expire -- confirm.
    headers = {
        "authority": "twitter.com",
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
        "cache-control": "no-cache",
        "content-type": "application/json",
        "dnt": "1",
        "pragma": "no-cache",
        "referer": "https://twitter.com/BMo3JQdAcg/status/402671294547648512",
        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
        "x-csrf-token": "3cc063510474507c7e1ae35576e53194",
        "x-guest-token": "1592748747951595520",
        "x-twitter-active-user": "yes",
        "x-twitter-client-language": "zh-cn"
    }
    cookies = {
        "guest_id_marketing": "v1%3A166841527748318144",
        "guest_id_ads": "v1%3A166841527748318144",
        "personalization_id": "\"v1_SIcw4PySge1De0L5TotiuQ==\"",
        "guest_id": "v1%3A166841527748318144",
        "external_referer": "padhuUp37zhsl55Izsa7f%2F2wsNVnW83D|0|8e8t2xd8A2w%3D",
        "_gid": "GA1.2.360851272.1668415299",
        "ct0": "3cc063510474507c7e1ae35576e53194",
        "g_state": "{\"i_p\":1669170412369,\"i_l\":3}",
        "at_check": "true",
        "mbox": "session#f303cf2ab1ad427e93eef340c6d1e79d#1668567608|PC#f303cf2ab1ad427e93eef340c6d1e79d.32_0#1731810548",
        "_ga_BYKEBDM7DS": "GS1.1.1668565653.1.1.1668565749.0.0.0",
        "_ga": "GA1.2.1925896343.1668415299",
        "gt": "1592748747951595520"
    }
    url = "https://twitter.com/i/api/graphql/BoHLKeBvibdYDiJON1oqTg/TweetDetail"

    # Loop-invariant request pieces are built once, outside the loop.
    features = "{\"responsive_web_twitter_blue_verified_badge_is_enabled\":true,\"verified_phone_label_enabled\":false,\"responsive_web_graphql_timeline_navigation_enabled\":true,\"unified_cards_ad_metadata_container_dynamic_card_content_query_enabled\":true,\"tweetypie_unmention_optimization_enabled\":true,\"responsive_web_uc_gql_enabled\":true,\"vibe_api_enabled\":true,\"responsive_web_edit_tweet_api_enabled\":true,\"graphql_is_translatable_rweb_tweet_is_translatable_enabled\":true,\"standardized_nudges_misinfo\":true,\"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled\":false,\"interactive_text_enabled\":true,\"responsive_web_text_conversations_enabled\":false,\"responsive_web_enhance_cards_enabled\":true}"
    # "focalTweetId" stays the first key so the serialised text keeps the
    # same key order as the hand-written JSON string it replaces.
    variables = {
        "focalTweetId": None,
        "referrer": "search",
        "with_rux_injections": False,
        "includePromotedContent": True,
        "withCommunity": True,
        "withQuickPromoteEligibilityTweetFields": True,
        "withBirdwatchNotes": False,
        "withSuperFollowsUserFields": True,
        "withDownvotePerspective": False,
        "withReactionsMetadata": False,
        "withReactionsPerspective": False,
        "withSuperFollowsTweetFields": True,
        "withVoice": True,
        "withV2Timeline": True,
    }
    # Without a timeout a stalled connection would block the caller forever.
    timeout_seconds = 30

    for twid in tw_id_lis:
        variables["focalTweetId"] = str(twid)
        params = {
            # Compact separators give JSON text without any whitespace;
            # json.dumps also escapes the id instead of pasting it in raw.
            "variables": json.dumps(variables, separators=(",", ":")),
            "features": features,
        }
        response = requests.get(
            url,
            headers=headers,
            cookies=cookies,
            params=params,
            timeout=timeout_seconds,
        )
        # Deliberately broad: one tweet with an unexpected response shape
        # is reported and skipped so the remaining ids are still processed.
        try:
            conversation = response.json()["data"]["threaded_conversation_with_injections_v2"]
            first_entry = conversation["instructions"][0]["entries"][0]
            tweet = first_entry["content"]["itemContent"]["tweet_results"]["result"]
            text = tweet["legacy"]["full_text"].strip()
            print(text)
            # ``que`` is a module-level name created by the script entry point.
            que.put(text)
        except Exception as e:
            print(e, e.__traceback__.tb_lineno)


def get_data2(cursor):
    """Fetch a follow-up search-result page identified by ``cursor``.

    Sends a single GET request to the ``adaptive.json`` search endpoint
    with the same fixed query as the first page plus a ``cursor``
    parameter.

    Args:
        cursor: pagination cursor; it is converted with ``str()`` and sent
            as the ``cursor`` query parameter.

    Returns:
        tuple: ``(next_cursor, tweet_ids)``. ``next_cursor`` is the value
        found at ``timeline.instructions[2].replaceEntry.entry`` in the
        JSON response; ``tweet_ids`` is the list of keys of
        ``globalObjects.tweets``.

    Raises:
        KeyError, IndexError: if the response JSON does not have the
            layout described above.
        requests.exceptions.RequestException: if the request fails or
            exceeds the timeout.
    """
    # NOTE(review): the bearer token, CSRF token, guest token and cookies
    # below are hard-coded; presumably they were captured from one browser
    # session and will stop working once they expire -- confirm.
    headers = {
        "authority": "twitter.com",
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
        "cache-control": "no-cache",
        "dnt": "1",
        "pragma": "no-cache",
        "referer": "https://twitter.com/search?q=%22submit%20office%22&src=typed_query&f=top",
        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
        "x-csrf-token": "3cc063510474507c7e1ae35576e53194",
        "x-guest-token": "1592748747951595520",
        "x-twitter-active-user": "yes",
        "x-twitter-client-language": "zh-cn"
    }
    cookies = {
        "guest_id_marketing": "v1%3A166841527748318144",
        "guest_id_ads": "v1%3A166841527748318144",
        "personalization_id": "\"v1_SIcw4PySge1De0L5TotiuQ==\"",
        "guest_id": "v1%3A166841527748318144",
        "external_referer": "padhuUp37zhsl55Izsa7f%2F2wsNVnW83D|0|8e8t2xd8A2w%3D",
        "_gid": "GA1.2.360851272.1668415299",
        "ct0": "3cc063510474507c7e1ae35576e53194",
        "g_state": "{\"i_p\":1669170412369,\"i_l\":3}",
        "at_check": "true",
        "mbox": "session#f303cf2ab1ad427e93eef340c6d1e79d#1668567608|PC#f303cf2ab1ad427e93eef340c6d1e79d.32_0#1731810548",
        "_ga_BYKEBDM7DS": "GS1.1.1668565653.1.1.1668565749.0.0.0",
        "_ga": "GA1.2.1925896343.1668415299",
        "gt": "1592748747951595520"
    }
    url = "https://twitter.com/i/api/2/search/adaptive.json"
    params = {
        "include_profile_interstitial_type": "1",
        "include_blocking": "1",
        "include_blocked_by": "1",
        "include_followed_by": "1",
        "include_want_retweets": "1",
        "include_mute_edge": "1",
        "include_can_dm": "1",
        "include_can_media_tag": "1",
        "include_ext_has_nft_avatar": "1",
        "include_ext_is_blue_verified": "1",
        "skip_status": "1",
        "cards_platform": "Web-12",
        "include_cards": "1",
        "include_ext_alt_text": "true",
        "include_ext_limited_action_results": "false",
        "include_quote_count": "true",
        "include_reply_count": "1",
        "tweet_mode": "extended",
        "include_ext_collab_control": "true",
        "include_entities": "true",
        "include_user_entities": "true",
        "include_ext_media_color": "true",
        "include_ext_media_availability": "true",
        "include_ext_sensitive_media_warning": "true",
        "include_ext_trusted_friends_metadata": "true",
        "send_error_codes": "true",
        "simple_quoted_tweet": "true",
        "q": "\"submit office\"",
        "count": "20",
        "query_source": "typed_query",
        "cursor": str(cursor),
        "pc": "1",
        "spelling_corrections": "1",
        "include_ext_edit_control": "true",
        "ext": "mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo,editControl,collab_control,vibe"
    }
    # Without a timeout a stalled connection would block the caller forever.
    timeout_seconds = 30
    response = requests.get(
        url,
        headers=headers,
        cookies=cookies,
        params=params,
        timeout=timeout_seconds,
    )

    # Decode the body once and reuse it for both return values.
    payload = response.json()

    # NOTE(review): instruction 2 is presumably the "replace bottom cursor"
    # instruction of a cursor-based page -- confirm against a live response.
    replaced_entry = payload["timeline"]["instructions"][2]["replaceEntry"]["entry"]
    next_cursor = replaced_entry["content"]["operation"]["cursor"]["value"]
    tweet_ids = list(payload["globalObjects"]["tweets"].keys())
    return next_cursor, tweet_ids


if __name__ == '__main__':
    # Script entry point: page through the search results, fetch the text of
    # every listed tweet in worker threads, then append the texts to res.txt.

    # get_detail() looks this queue up by its module-level name ``que``.
    que = Queue()

    workers = []
    seen_ids = set()
    cursor = ""
    num = 0
    while True:
        num += 1
        if num > 2:
            cursor, tw_id_lis = get_data2(cursor)
            # An empty page ends the pagination loop.
            if not tw_id_lis:
                break
        else:
            cursor, tw_id_lis = get_data()
        time.sleep(1)

        # get_data() takes no cursor, so iterations 1 and 2 send the same
        # request; skip ids that were already handed to a worker.
        new_ids = [tw_id for tw_id in tw_id_lis if tw_id not in seen_ids]
        seen_ids.update(new_ids)
        if not new_ids:
            continue

        # start() launches the worker thread; calling run() directly would
        # execute get_detail in this thread and block the paging loop.
        worker = Thread(target=get_detail, args=(new_ids,))
        worker.start()
        workers.append(worker)

    # Wait for every worker so the queue is complete before it is drained.
    for worker in workers:
        worker.join()

    # Save the results; the context manager closes the file when done.
    with open("./res.txt", "a", encoding="utf-8") as fp:
        while not que.empty():
            fp.write(que.get() + "\n")

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。

文章由极客之音整理,本文链接:https://www.bmabk.com/index.php/post/156884.html

(0)
飞熊的头像飞熊bm

相关推荐

发表回复

登录后才能评论
极客之音——专业性很强的中文编程技术网站,欢迎收藏到浏览器,订阅我们!