You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

googlescholar.py 4.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import logging
  17. import os
  18. import time
  19. from abc import ABC
  20. from scholarly import scholarly
  21. from agent.tools.base import ToolMeta, ToolParamBase, ToolBase
  22. from api.utils.api_utils import timeout
  23. class GoogleScholarParam(ToolParamBase):
  24. """
  25. Define the GoogleScholar component parameters.
  26. """
  27. def __init__(self):
  28. self.meta:ToolMeta = {
  29. "name": "google_scholar_search",
  30. "description": """Google Scholar provides a simple way to broadly search for scholarly literature. From one place, you can search across many disciplines and sources: articles, theses, books, abstracts and court opinions, from academic publishers, professional societies, online repositories, universities and other web sites. Google Scholar helps you find relevant work across the world of scholarly research.""",
  31. "parameters": {
  32. "query": {
  33. "type": "string",
  34. "description": "The search keyword to execute with Google Scholar. The keywords should be the most important words/terms(includes synonyms) from the original request.",
  35. "default": "{sys.query}",
  36. "required": True
  37. }
  38. }
  39. }
  40. super().__init__()
  41. self.top_n = 12
  42. self.sort_by = 'relevance'
  43. self.year_low = None
  44. self.year_high = None
  45. self.patents = True
  46. def check(self):
  47. self.check_positive_integer(self.top_n, "Top N")
  48. self.check_valid_value(self.sort_by, "GoogleScholar Sort_by", ['date', 'relevance'])
  49. self.check_boolean(self.patents, "Whether or not to include patents, defaults to True")
  50. def get_input_form(self) -> dict[str, dict]:
  51. return {
  52. "query": {
  53. "name": "Query",
  54. "type": "line"
  55. }
  56. }
  57. class GoogleScholar(ToolBase, ABC):
  58. component_name = "GoogleScholar"
  59. @timeout(os.environ.get("COMPONENT_EXEC_TIMEOUT", 12))
  60. def _invoke(self, **kwargs):
  61. if not kwargs.get("query"):
  62. self.set_output("formalized_content", "")
  63. return ""
  64. last_e = ""
  65. for _ in range(self._param.max_retries+1):
  66. try:
  67. scholar_client = scholarly.search_pubs(kwargs["query"], patents=self._param.patents, year_low=self._param.year_low,
  68. year_high=self._param.year_high, sort_by=self._param.sort_by)
  69. self._retrieve_chunks(scholar_client,
  70. get_title=lambda r: r['bib']['title'],
  71. get_url=lambda r: r["pub_url"],
  72. get_content=lambda r: "\n author: " + ",".join(r['bib']['author']) + '\n Abstract: ' + r['bib'].get('abstract', 'no abstract')
  73. )
  74. self.set_output("json", list(scholar_client))
  75. return self.output("formalized_content")
  76. except Exception as e:
  77. last_e = e
  78. logging.exception(f"GoogleScholar error: {e}")
  79. time.sleep(self._param.delay_after_error)
  80. if last_e:
  81. self.set_output("_ERROR", str(last_e))
  82. return f"GoogleScholar error: {last_e}"
  83. assert False, self.output()
  84. def thoughts(self) -> str:
  85. return "Looking for scholarly papers on `{}`,” prioritising reputable sources.".format(self.get_input().get("query", "-_-!"))