| @@ -89,22 +89,6 @@ class IndexingRunner: | |||
| dataset_document.stopped_at = datetime.datetime.utcnow() | |||
| db.session.commit() | |||
def format_split_text(self, text):
    """Parse ``Qn: ... An: ...`` formatted text into question/answer pairs.

    Each pair starts with a ``Q<digits>:`` marker followed by the question
    (captured on a single line) and an ``A<digits>:`` marker followed by the
    answer. The answer may span several lines and runs until the next
    ``Q<digits>:`` marker or the end of the text.

    :param text: raw text containing zero or more numbered Q/A pairs.
    :return: list of ``{"question": ..., "answer": ...}`` dicts, in order of
        appearance. Pairs whose question or answer is empty are dropped.
    """
    # The lookahead must stop only at the next "Q<digits>:" marker or at the
    # real end of the text. A bare "Q" would cut an answer at any capital Q
    # (e.g. "Quite"), and re.MULTILINE would let "$" match at the end of
    # every line, truncating multi-line answers to their first line.
    regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q\d+:|$)"
    matches = re.findall(regex, text)
    return [
        {
            "question": q,
            # Collapse the indentation of continuation lines in the answer.
            "answer": re.sub(r"\n\s*", "\n", a.strip()),
        }
        for q, a in matches
        if q and a
    ]
| def run_in_splitting_status(self, dataset_document: DatasetDocument): | |||
| """Run the indexing process when the index_status is splitting.""" | |||
| try: | |||
| @@ -647,21 +631,16 @@ class IndexingRunner: | |||
| return text | |||
def format_split_text(self, text):
    """Extract numbered question/answer pairs from generated text.

    The expected layout is ``Q<digits>: <question> A<digits>: <answer>``,
    repeated any number of times. The question is captured on a single
    line; the answer may span multiple lines and ends at the next
    ``Q<digits>:`` marker or at the end of the text.

    :param text: raw text containing zero or more numbered Q/A pairs.
    :return: list of ``{"question": ..., "answer": ...}`` dicts, in order of
        appearance. Pairs with an empty question or answer are skipped.
    """
    # Regex matching one Q/A pair. The lookahead ends the answer only at the
    # next "Q<digits>:" marker or the true end of the text: a bare "Q" would
    # split on any capital Q inside an answer, and re.MULTILINE would make
    # "$" match at every line end, cutting multi-line answers short.
    regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q\d+:|$)"
    # Collect every (question, answer) tuple found in the text.
    matches = re.findall(regex, text)
    return [
        {
            "question": q,
            # Strip indentation from continuation lines of the answer.
            "answer": re.sub(r"\n\s*", "\n", a.strip()),
        }
        for q, a in matches
        # Keep a pair only when both the question and the answer exist.
        if q and a
    ]
| def _build_index(self, dataset: Dataset, dataset_document: DatasetDocument, documents: List[Document]) -> None: | |||
| """ | |||