@@ -88,8 +88,8 @@ If your machine doesn't have *Docker* installed, please refer to [Install Docker
> In **user_default_llm** of [service_conf.yaml](./docker/service_conf.yaml), you need to specify the LLM factory and your own _API_KEY_.
> It's OK if you don't have an _API_KEY_ at the moment; you can specify it later on the settings page after starting up and logging in to the system (a minimal sketch of this section follows below).
> - The following LLM factories are supported, with more coming soon:
> [OpenAI](https://platform.openai.com/login?launch), [通义千问/QWen](https://dashscope.console.aliyun.com/model),
> [智谱AI/ZhipuAI](https://open.bigmodel.cn/)
> [OpenAI](https://platform.openai.com/login?launch), [Tongyi-Qianwen](https://dashscope.console.aliyun.com/model),
> [ZHIPU-AI](https://open.bigmodel.cn/), [Moonshot](https://platform.moonshot.cn/docs/docs)
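
For reference, the `user_default_llm` block that [service_conf.yaml](./docker/service_conf.yaml) is expected to contain is read by `api/settings.py` further below (keys `factory`, `api_key` and, optionally, `parsers`). A minimal sketch with placeholder values, not a verbatim copy of the shipped file, might look like:

```yaml
# Sketch of the user_default_llm section consumed by api/settings.py.
# "factory" must match a key of default_llm (e.g. "Tongyi-Qianwen", "OpenAI",
# "ZHIPU-AI"); the api_key below is a placeholder, not a real credential.
user_default_llm:
  factory: "Tongyi-Qianwen"
  api_key: "sk-xxxxxxxxxxxxxxxx"
```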
```bash
121:/# git clone https://github.com/infiniflow/ragflow.git
121:/# cd ragflow/docker
@@ -79,3 +79,4 @@ class ParserType(StrEnum):
    TABLE = "table"
    NAIVE = "naive"
    PICTURE = "picture"
    ONE = "one"
@@ -79,12 +79,12 @@ factory_infos = [{
    "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
    "status": "1",
},{
    "name": "通义千问",
    "name": "Tongyi-Qianwen",
    "logo": "",
    "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
    "status": "1",
},{
    "name": "智谱AI",
    "name": "ZHIPU-AI",
    "logo": "",
    "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
    "status": "1",
@@ -270,6 +270,14 @@ def init_llm_factory():
        except Exception as e:
            pass

    """
    drop table llm;
    drop table factories;
    update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问';
    update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI';
    update tenant set parser_ids='naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture';
    """


def init_web_data():
    start_time = time.time()
@@ -52,7 +52,7 @@ REQUEST_MAX_WAIT_SEC = 300
USE_REGISTRY = get_base_config("use_registry")

default_llm = {
    "通义千问": {
    "Tongyi-Qianwen": {
        "chat_model": "qwen-plus",
        "embedding_model": "text-embedding-v2",
        "image2text_model": "qwen-vl-max",
@@ -64,7 +64,7 @@ default_llm = {
        "image2text_model": "gpt-4-vision-preview",
        "asr_model": "whisper-1",
    },
    "智谱AI": {
    "ZHIPU-AI": {
        "chat_model": "glm-3-turbo",
        "embedding_model": "embedding-2",
        "image2text_model": "glm-4v",
@@ -84,17 +84,17 @@ default_llm = {
    }
}
LLM = get_base_config("user_default_llm", {})
LLM_FACTORY = LLM.get("factory", "通义千问")
LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen")
if LLM_FACTORY not in default_llm:
    print("\33[91m【ERROR】\33[0m:", f"LLM factory {LLM_FACTORY} has not supported yet, switch to '通义千问/QWen' automatically, and please check the API_KEY in service_conf.yaml.")
    LLM_FACTORY = "通义千问"
    print("\33[91m【ERROR】\33[0m:", f"LLM factory {LLM_FACTORY} is not supported yet; switching to 'Tongyi-Qianwen' automatically. Please check the API_KEY in service_conf.yaml.")
    LLM_FACTORY = "Tongyi-Qianwen"
CHAT_MDL = default_llm[LLM_FACTORY]["chat_model"]
EMBEDDING_MDL = default_llm[LLM_FACTORY]["embedding_model"]
ASR_MDL = default_llm[LLM_FACTORY]["asr_model"]
IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
API_KEY = LLM.get("api_key", "")
PARSERS = LLM.get("parsers", "naive:General,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture")
PARSERS = LLM.get("parsers", "naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture")

# distribution
DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
@@ -57,7 +57,7 @@ class Pdf(PdfParser):
        sec_ids = []
        sid = 0
        for i, lvl in enumerate(levels):
            if lvl <= most_level: sid += 1
            if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1
            sec_ids.append(sid)
            # print(lvl, self.boxes[i]["text"], most_level)
@@ -75,7 +75,7 @@ class Pdf(PdfParser):
                    continue
            chunks.append(txt + poss)
            if sec_id > -1: last_sid = sec_id
        return chunks
        return chunks, tbls


def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -86,7 +86,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        cks = pdf_parser(filename if not binary else binary,
        cks, tbls = pdf_parser(filename if not binary else binary,
                               from_page=from_page, to_page=to_page, callback=callback)
    else: raise NotImplementedError("file type not supported yet(pdf supported)")
    doc = {
@@ -100,7 +100,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
    i = 0
    chunk = []
    tk_cnt = 0
    res = []
    res = tokenize_table(tbls, doc, eng)

    def add_chunk():
        nonlocal chunk, res, doc, pdf_parser, tk_cnt
        d = copy.deepcopy(doc)
@@ -49,7 +49,7 @@ class Pdf(PdfParser):


def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
    """
    Supported file formats are docx, pdf, txt.
    Supported file formats are docx, pdf, excel, txt.
    This method applies a naive way to chunk files.
    Successive text will be sliced into pieces using the 'delimiter'.
    Next, these pieces are merged into chunks whose token number is no more than 'Max token number'.
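
The docstring above describes the core of the naive parser: split text on a set of delimiter characters, then greedily pack the pieces into chunks that stay under a token budget. The snippet below is only an illustrative, self-contained sketch of that idea; it is not the repository's `naive_merge`, and it counts tokens by whitespace splitting purely for demonstration, whereas the project uses its own tokenizer:

```python
import re


def naive_chunks(text, delimiters=r"\n。；;!！?？", max_tokens=128):
    """Split on any of the delimiter characters, then merge the pieces into
    chunks whose (approximate) token count stays below max_tokens."""
    pieces = [p.strip() for p in re.split(r"[%s]+" % delimiters, text) if p.strip()]
    chunks, current, cnt = [], [], 0
    for piece in pieces:
        n = len(piece.split())  # crude token count, assumed for this sketch
        if current and cnt + n > max_tokens:
            chunks.append(" ".join(current))
            current, cnt = [], 0
        current.append(piece)
        cnt += n
    if current:
        chunks.append(" ".join(current))
    return chunks


if __name__ == "__main__":
    print(naive_chunks("How are you? I am fine! Thanks, see you later.", max_tokens=4))
```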
@@ -0,0 +1,108 @@
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import copy
import re

from rag.app import laws
from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
from deepdoc.parser import PdfParser, ExcelParser
from rag.settings import cron_logger


class Pdf(PdfParser):
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        callback(msg="OCR is running...")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_rec(zoomin)
        callback(0.63, "Layout analysis finished.")
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.65, "Table analysis finished.")
        self._text_merge()
        callback(0.67, "Text merging finished")
        tbls = self._extract_table_figure(True, zoomin, True, True)
        self._concat_downward()

        sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
        for (img, rows), poss in tbls:
            sections.append((rows if isinstance(rows, str) else rows[0],
                             [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
        return [txt for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))]


def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
    """
    Supported file formats are docx, pdf, excel, txt.
    One file forms a single chunk which maintains the original text order.
    """
    eng = lang.lower() == "english"  # is_english(cks)

    sections = []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        for txt in laws.Docx()(filename, binary):
            sections.append(txt)
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        sections = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback)
    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = ExcelParser()
        sections = [excel_parser.html(binary)]
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
        if binary:
            txt = binary.decode("utf-8")
        else:
            with open(filename, "r") as f:
                while True:
                    l = f.readline()
                    if not l:
                        break
                    txt += l
        sections = txt.split("\n")
        # Keep plain strings here so the "\n".join(sections) below works for every branch.
        sections = [l for l in sections if l]
        callback(0.8, "Finish parsing.")
    else:
        raise NotImplementedError("file type not supported yet(docx, pdf, excel, txt supported)")

    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    tokenize(doc, "\n".join(sections), eng)
    return [doc]


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
@@ -21,8 +21,8 @@ from .cv_model import *

EmbeddingModel = {
    "Local": HuEmbedding,
    "OpenAI": OpenAIEmbed,
    "通义千问": HuEmbedding,  # QWenEmbed,
    "智谱AI": ZhipuEmbed,
    "Tongyi-Qianwen": HuEmbedding,  # QWenEmbed,
    "ZHIPU-AI": ZhipuEmbed,
    "Moonshot": HuEmbedding
}
@@ -30,16 +30,16 @@ EmbeddingModel = {
CvModel = {
    "OpenAI": GptV4,
    "Local": LocalCV,
    "通义千问": QWenCV,
    "智谱AI": Zhipu4V,
    "Tongyi-Qianwen": QWenCV,
    "ZHIPU-AI": Zhipu4V,
    "Moonshot": LocalCV
}


ChatModel = {
    "OpenAI": GptTurbo,
    "智谱AI": ZhipuChat,
    "通义千问": QWenChat,
    "ZHIPU-AI": ZhipuChat,
    "Tongyi-Qianwen": QWenChat,
    "Local": LocalLLM,
    "Moonshot": MoonshotChat
}
@@ -194,7 +194,7 @@ class Dealer:
        return [float(t) for t in txt.split("\t")]

    def insert_citations(self, answer, chunks, chunk_v,
                         embd_mdl, tkweight=0.7, vtweight=0.3):
                         embd_mdl, tkweight=0.1, vtweight=0.9):
        assert len(chunks) == len(chunk_v)
        pieces = re.split(r"(```)", answer)
        if len(pieces) >= 3:
@@ -243,7 +243,7 @@ class Dealer:
                chunks_tks,
                tkweight, vtweight)
            mx = np.max(sim) * 0.99
            if mx < 0.7:
            if mx < 0.65:
                continue
            cites[idx[i]] = list(
                set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
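
The two hunks above re-balance how citations are matched: the combined score is now dominated by embedding similarity (`vtweight=0.9` vs `tkweight=0.1`), and a candidate list is kept whenever the best score clears 0.65 instead of 0.7. The similarity computation itself is not shown in this diff; the following is only a hedged sketch of the weighting and selection rule, with NumPy standing in for the project's own helpers:

```python
import numpy as np


def hybrid_similarity_sketch(token_sim, vector_sim, tkweight=0.1, vtweight=0.9):
    """Weighted mix of token-overlap and embedding similarity (illustrative only)."""
    return tkweight * np.asarray(token_sim) + vtweight * np.asarray(vector_sim)


def cite_indices_sketch(sim, top_k=4):
    """Mirror the selection above: keep chunks within 1% of the best score,
    provided the best score clears the 0.65 threshold."""
    sim = np.asarray(sim)
    mx = sim.max() * 0.99
    if mx < 0.65:
        return []
    return [int(i) for i in np.argsort(-sim) if sim[i] > mx][:top_k]
```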
@@ -84,6 +84,7 @@ def dispatch():
            pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
            page_size = 5
            if r["parser_id"] == "paper": page_size = 12
            if r["parser_id"] == "one": page_size = 1000000000
            for s, e in r["parser_config"].get("pages", [(0, 100000)]):
                e = min(e, pages)
                for p in range(s, e, page_size):
@@ -39,7 +39,7 @@ from rag.nlp import search
from io import BytesIO
import pandas as pd

from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one
from api.db import LLMType, ParserType
from api.db.services.document_service import DocumentService
@@ -60,6 +60,7 @@ FACTORY = {
    ParserType.TABLE.value: table,
    ParserType.RESUME.value: resume,
    ParserType.PICTURE.value: picture,
    ParserType.ONE.value: one,
}
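
`FACTORY` maps a document's `parser_id` to the chunker module that should process it, so registering `ParserType.ONE.value: one` is what makes the new parser reachable from the task executor. The real call site in `rag/svr/task_executor.py` is not included in this diff; the snippet below is only a hypothetical illustration of how such a lookup could be driven, with `task`, `blob`, and `progress` as placeholder names:

```python
from rag.app import naive, one
from api.db import ParserType

FACTORY = {
    ParserType.NAIVE.value: naive,
    ParserType.ONE.value: one,
}


def build_chunks_sketch(task, blob):
    """Pick a chunker by parser_id and run it (illustrative only)."""
    def progress(prog=None, msg=""):
        print(prog, msg)

    chunker = FACTORY.get(task["parser_id"], naive)  # fall back to the naive parser
    return chunker.chunk(task["name"], binary=blob,
                         from_page=task["from_page"], to_page=task["to_page"],
                         lang=task.get("language", "Chinese"), callback=progress)
```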