Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

pubmed.py 4.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import logging
  17. import os
  18. import time
  19. from abc import ABC
  20. from Bio import Entrez
  21. import re
  22. import xml.etree.ElementTree as ET
  23. from agent.tools.base import ToolParamBase, ToolMeta, ToolBase
  24. from api.utils.api_utils import timeout
  25. class PubMedParam(ToolParamBase):
  26. """
  27. Define the PubMed component parameters.
  28. """
  29. def __init__(self):
  30. self.meta:ToolMeta = {
  31. "name": "pubmed_search",
  32. "description": """
  33. PubMed is an openly accessible, free database which includes primarily the MEDLINE database of references and abstracts on life sciences and biomedical topics.
  34. In addition to MEDLINE, PubMed provides access to:
  35. - older references from the print version of Index Medicus, back to 1951 and earlier
  36. - references to some journals before they were indexed in Index Medicus and MEDLINE, for instance Science, BMJ, and Annals of Surgery
  37. - very recent entries to records for an article before it is indexed with Medical Subject Headings (MeSH) and added to MEDLINE
  38. - a collection of books available full-text and other subsets of NLM records[4]
  39. - PMC citations
  40. - NCBI Bookshelf
  41. """,
  42. "parameters": {
  43. "query": {
  44. "type": "string",
  45. "description": "The search keywords to execute with PubMed. The keywords should be the most important words/terms(includes synonyms) from the original request.",
  46. "default": "{sys.query}",
  47. "required": True
  48. }
  49. }
  50. }
  51. super().__init__()
  52. self.top_n = 12
  53. self.email = "A.N.Other@example.com"
  54. def check(self):
  55. self.check_positive_integer(self.top_n, "Top N")
  56. def get_input_form(self) -> dict[str, dict]:
  57. return {
  58. "query": {
  59. "name": "Query",
  60. "type": "line"
  61. }
  62. }
  63. class PubMed(ToolBase, ABC):
  64. component_name = "PubMed"
  65. @timeout(os.environ.get("COMPONENT_EXEC_TIMEOUT", 12))
  66. def _invoke(self, **kwargs):
  67. if not kwargs.get("query"):
  68. self.set_output("formalized_content", "")
  69. return ""
  70. last_e = ""
  71. for _ in range(self._param.max_retries+1):
  72. try:
  73. Entrez.email = self._param.email
  74. pubmedids = Entrez.read(Entrez.esearch(db='pubmed', retmax=self._param.top_n, term=kwargs["query"]))['IdList']
  75. pubmedcnt = ET.fromstring(re.sub(r'<(/?)b>|<(/?)i>', '', Entrez.efetch(db='pubmed', id=",".join(pubmedids),
  76. retmode="xml").read().decode("utf-8")))
  77. self._retrieve_chunks(pubmedcnt.findall("PubmedArticle"),
  78. get_title=lambda child: child.find("MedlineCitation").find("Article").find("ArticleTitle").text,
  79. get_url=lambda child: "https://pubmed.ncbi.nlm.nih.gov/" + child.find("MedlineCitation").find("PMID").text,
  80. get_content=lambda child: child.find("MedlineCitation") \
  81. .find("Article") \
  82. .find("Abstract") \
  83. .find("AbstractText").text \
  84. if child.find("MedlineCitation")\
  85. .find("Article").find("Abstract") \
  86. else "No abstract available")
  87. return self.output("formalized_content")
  88. except Exception as e:
  89. last_e = e
  90. logging.exception(f"PubMed error: {e}")
  91. time.sleep(self._param.delay_after_error)
  92. if last_e:
  93. self.set_output("_ERROR", str(last_e))
  94. return f"PubMed error: {last_e}"
  95. assert False, self.output()
  96. def thoughts(self) -> str:
  97. return "Looking for scholarly papers on `{}`,” prioritising reputable sources.".format(self.get_input().get("query", "-_-!"))