# zhishiku_bingxs.py (1.5 KB)

123456789101112131415161718192021222324252627282930313233343536373839
  1. import requests
  2. import re
  3. from plugins import settings
  4. session = requests.Session()
  5. # 正则提取摘要和链接
  6. title_pattern = re.compile('<li class="aca_algo"><h2 class=""><a(.*?)</a>')
  7. brief_pattern = re.compile('<div class="caption_abstract"><p>(.*?)</p>')
  8. link_pattern = re.compile(
  9. '<li class="aca_algo"><h2 class=""><a href="(.*?)" h="')
  10. headers = {
  11. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31'}
  12. proxies = {"http": None,"https": None,}
  13. def find(search_query):
  14. url = 'https://cn.bing.com/academic/search?q={}'.format(search_query)
  15. res = session.get(url, headers=headers ,proxies=proxies)
  16. r = res.text
  17. title = title_pattern.findall(r)
  18. brief = brief_pattern.findall(r)
  19. link = link_pattern.findall(r)
  20. # 数据清洗
  21. clear_brief = []
  22. for i in brief:
  23. tmp = re.sub('<[^<]+?>', '', i).replace('\n', '').strip()
  24. tmp1 = re.sub('^.*&ensp;', '', tmp).replace('\n', '').strip()
  25. tmp2 = re.sub('^.*>', '', tmp1).replace('\n', '').strip()
  26. clear_brief.append(tmp2)
  27. clear_title = []
  28. for i in title:
  29. tmp = re.sub('^.*?>', '', i).replace('\n', '').strip()
  30. tmp2 = re.sub('<[^<]+?>', '', tmp).replace('\n', '').strip()
  31. clear_title.append(tmp2)
  32. # print(clear_title,link,clear_brief)
  33. return [{'title': "["+clear_title[i]+"]("+link[i]+")", 'content':clear_brief[i]}
  34. for i in range(min(settings.chunk_count, len(brief)))]
# 粤ICP备19079148号 (ICP filing number from the hosting page footer; not part of the module)