zhishiku_bingfull.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. import requests
  2. import re
  3. from bs4 import BeautifulSoup
  4. from plugins.common import settings
  5. import selenium
  6. from selenium import webdriver
  7. from selenium.webdriver.support.wait import WebDriverWait
  8. from selenium.webdriver.chrome.service import Service
  9. from selenium.webdriver.common.keys import Keys
  10. from selenium.webdriver.common.by import By
  11. import time
  12. session = requests.Session()
  13. # 正则提取摘要和链接
  14. title_pattern = re.compile('<a.target=..blank..target..(.*?)</a>')
  15. brief_pattern = re.compile('K=.SERP(.*?)</p>')
  16. link_pattern = re.compile(
  17. '(?<=(a.target=._blank..target=._blank..href=.))(.*?)(?=(..h=))')
  18. headers = {
  19. "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0',
  20. "Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  21. "Accept-Language" : "zh-CN,zh;q=0.9",
  22. "Accept-Encoding" : "gzip, deflate, br",
  23. "DNT" : "1",
  24. "Connection" : "cloes"
  25. }
  26. proxies = {'https':None, 'http':None,}
  27. def countchn(string):
  28. pattern = re.compile(u'[\u1100-\uFFFDh]+?')
  29. result = pattern.findall(string)
  30. chnnum = len(result) #list的长度即是中文的字数
  31. possible = chnnum/len(str(string)) #possible = 中文字数/总字数
  32. return (chnnum, possible)
  33. def findtext(part):
  34. length = 100000
  35. l = []
  36. for paragraph in part:
  37. chnstatus = countchn(str(paragraph))
  38. possible = chnstatus[1]
  39. if possible > 0.05:
  40. l.append(paragraph)
  41. l_t = l[:]
  42. paragraph_f = []
  43. #这里需要复制一下表,在新表中再次筛选,要不然会出问题,跟Python的内存机制有关
  44. for elements in l_t:
  45. chnstatus = countchn(str(elements))
  46. chnnum2 = chnstatus[0]
  47. if chnnum2 < 300:
  48. #最终测试结果表明300字是一个比较靠谱的标准,低于300字的正文咱也不想要了对不
  49. l.remove(elements)
  50. elif len(str(elements))<length:
  51. # length = len(str(elements))
  52. # paragraph_f = elements
  53. paragraph_f.append(elements.text)
  54. return paragraph_f
  55. # return max(paragraph_f, key=len, default='') # 返回最长的字符串
  56. def selenium_read(url):
  57. options = webdriver.ChromeOptions()
  58. options.add_experimental_option("excludeSwitches", ['enable-automation'])
  59. options.add_experimental_option('useAutomationExtension', False)
  60. options.add_argument('headless')
  61. options.add_argument('--disable-gpu')
  62. ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0'
  63. options.add_argument(f'user-agent={ua}')
  64. options.headless = False # 阻止弹出浏览器窗口,静默执行
  65. service = Service(executable_path=".\\chromedriver_win.exe") # driver文件位置
  66. driver = webdriver.Chrome(service=service, options=options)
  67. driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { # 去除selenium指纹
  68. "source": """
  69. Object.defineProperty(navigator, 'webdriver', {
  70. get: () => undefined
  71. })
  72. """
  73. })
  74. try:
  75. driver.get(url) # 这次返回的是 521 相关的防爬js代码
  76. driver.implicitly_wait(1)
  77. time.sleep(0.2) # 多次等待模拟真实浏览器
  78. cookie = driver.get_cookies()
  79. time.sleep(0.2)
  80. for cc in range(0, len(cookie)): # 使用动态cookies
  81. driver.add_cookie(cookie[cc])
  82. time.sleep(0.1)
  83. driver.get(url) # 调用2次 browser.get 解决 521 问题
  84. time.sleep(0.2)
  85. html = driver.page_source
  86. driver.quit()
  87. return html
  88. except:
  89. return "Connection Timeout"
  90. def find(search_query,step = 0,max_title = 5):
  91. # max_title:最多点击的条目数,由于爬取网站耗时较多,不建议设置的太大
  92. url = 'https://cn.bing.com/search?q={}'.format(search_query)
  93. res = session.get(url, headers=headers, proxies=proxies)
  94. r = res.text
  95. title = title_pattern.findall(r)
  96. brief = brief_pattern.findall(r)
  97. link = link_pattern.findall(r)
  98. # 数据清洗
  99. clear_content = []
  100. for i in range(min(len(link), int(max_title))):
  101. content_r = ""
  102. try:
  103. href = link[i][1]
  104. html_text = selenium_read(href)
  105. soup = BeautifulSoup(html_text, 'html.parser')
  106. part = soup.select('div')
  107. para_f = findtext(part)
  108. except:
  109. para_f = []
  110. tmp = re.sub('<[^<]+?>', '', brief[i]).replace('\n', '').strip()
  111. tmp = re.sub('^.*&ensp;', '', tmp).replace('\n', '').strip()
  112. brief_r = re.sub('^.*>', '', tmp).replace('\n', '').strip()
  113. matched_indicator = brief_r[0:5]
  114. ct = brief_r
  115. for content_r in para_f:
  116. tmp = re.sub('<[^<]+?>', '', content_r).replace('\n', '').strip()
  117. tmp = re.sub('^.*&ensp;', '', tmp).replace('\n', '').strip()
  118. content_r = re.sub('^.*>', '', tmp).replace('\n', '').strip()
  119. idx = content_r.find(matched_indicator)
  120. if idx != -1:
  121. start_idx = int(max(idx-800, 0))
  122. end_idx = int(min(idx+1800, len(content_r)))
  123. ct = content_r[start_idx:end_idx]
  124. break
  125. print(str(i+1)+"/"+str(len(link)))
  126. print(link[i][1])
  127. print(ct)
  128. clear_content.append(ct)
  129. clear_title = []
  130. for i in range(min(len(title), int(max_title))):
  131. title_i = title[i]
  132. tmp = re.sub('^.*?>', '', title_i).replace('\n', '').strip()
  133. tmp2 = re.sub('<[^<]+?>', '', tmp).replace('\n', '').strip()
  134. clear_title.append(tmp2)
  135. return [{'title': "["+clear_title[i]+"]("+link[i][1]+")", 'content':clear_content[i]}
  136. for i in range(min(int(settings.librarys.bing.count), len(clear_content)))]
粤ICP备19079148号