| > In **user_default_llm** of [service_conf.yaml](./docker/service_conf.yaml), you need to specify LLM factory and your own _API_KEY_. | > In **user_default_llm** of [service_conf.yaml](./docker/service_conf.yaml), you need to specify LLM factory and your own _API_KEY_. | ||||
| > It's OK if you don't have an _API_KEY_ at the moment; you can specify it later in the settings after starting the system and logging in. | > It's OK if you don't have an _API_KEY_ at the moment; you can specify it later in the settings after starting the system and logging in. | ||||
| > - We currently support the following LLM factories, and others are coming soon: | > - We currently support the following LLM factories, and others are coming soon: | ||||
| > [OpenAI](https://platform.openai.com/login?launch), [通义千问/QWen](https://dashscope.console.aliyun.com/model), | |||||
| > [智谱AI/ZhipuAI](https://open.bigmodel.cn/) | |||||
| > [OpenAI](https://platform.openai.com/login?launch), [Tongyi-Qianwen](https://dashscope.console.aliyun.com/model), | |||||
| > [ZHIPU-AI](https://open.bigmodel.cn/), [Moonshot](https://platform.moonshot.cn/docs/docs) | |||||
| ```bash | ```bash | ||||
| 121:/# git clone https://github.com/infiniflow/ragflow.git | 121:/# git clone https://github.com/infiniflow/ragflow.git | ||||
| 121:/# cd ragflow/docker | 121:/# cd ragflow/docker |
| TABLE = "table" | TABLE = "table" | ||||
| NAIVE = "naive" | NAIVE = "naive" | ||||
| PICTURE = "picture" | PICTURE = "picture" | ||||
| ONE = "one" |
| "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", | "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", | ||||
| "status": "1", | "status": "1", | ||||
| },{ | },{ | ||||
| "name": "通义千问", | |||||
| "name": "Tongyi-Qianwen", | |||||
| "logo": "", | "logo": "", | ||||
| "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", | "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", | ||||
| "status": "1", | "status": "1", | ||||
| },{ | },{ | ||||
| "name": "智谱AI", | |||||
| "name": "ZHIPU-AI", | |||||
| "logo": "", | "logo": "", | ||||
| "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", | "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", | ||||
| "status": "1", | "status": "1", | ||||
| except Exception as e: | except Exception as e: | ||||
| pass | pass | ||||
| """ | |||||
| drop table llm; | |||||
| drop table factories; | |||||
| update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问'; | |||||
| update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI'; | |||||
| update tenant set parser_ids='naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture'; | |||||
| """ | |||||
| def init_web_data(): | def init_web_data(): | ||||
| start_time = time.time() | start_time = time.time() |
| USE_REGISTRY = get_base_config("use_registry") | USE_REGISTRY = get_base_config("use_registry") | ||||
| default_llm = { | default_llm = { | ||||
| "通义千问": { | |||||
| "Tongyi-Qianwen": { | |||||
| "chat_model": "qwen-plus", | "chat_model": "qwen-plus", | ||||
| "embedding_model": "text-embedding-v2", | "embedding_model": "text-embedding-v2", | ||||
| "image2text_model": "qwen-vl-max", | "image2text_model": "qwen-vl-max", | ||||
| "image2text_model": "gpt-4-vision-preview", | "image2text_model": "gpt-4-vision-preview", | ||||
| "asr_model": "whisper-1", | "asr_model": "whisper-1", | ||||
| }, | }, | ||||
| "智谱AI": { | |||||
| "ZHIPU-AI": { | |||||
| "chat_model": "glm-3-turbo", | "chat_model": "glm-3-turbo", | ||||
| "embedding_model": "embedding-2", | "embedding_model": "embedding-2", | ||||
| "image2text_model": "glm-4v", | "image2text_model": "glm-4v", | ||||
| } | } | ||||
| } | } | ||||
| LLM = get_base_config("user_default_llm", {}) | LLM = get_base_config("user_default_llm", {}) | ||||
| LLM_FACTORY = LLM.get("factory", "通义千问") | |||||
| LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen") | |||||
| if LLM_FACTORY not in default_llm: | if LLM_FACTORY not in default_llm: | ||||
| print("\33[91m【ERROR】\33[0m:", f"LLM factory {LLM_FACTORY} has not supported yet, switch to '通义千问/QWen' automatically, and please check the API_KEY in service_conf.yaml.") | |||||
| LLM_FACTORY = "通义千问" | |||||
| print("\33[91m【ERROR】\33[0m:", f"LLM factory {LLM_FACTORY} has not supported yet, switch to 'Tongyi-Qianwen/QWen' automatically, and please check the API_KEY in service_conf.yaml.") | |||||
| LLM_FACTORY = "Tongyi-Qianwen" | |||||
| CHAT_MDL = default_llm[LLM_FACTORY]["chat_model"] | CHAT_MDL = default_llm[LLM_FACTORY]["chat_model"] | ||||
| EMBEDDING_MDL = default_llm[LLM_FACTORY]["embedding_model"] | EMBEDDING_MDL = default_llm[LLM_FACTORY]["embedding_model"] | ||||
| ASR_MDL = default_llm[LLM_FACTORY]["asr_model"] | ASR_MDL = default_llm[LLM_FACTORY]["asr_model"] | ||||
| IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"] | IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"] | ||||
| API_KEY = LLM.get("api_key", "") | API_KEY = LLM.get("api_key", "") | ||||
| PARSERS = LLM.get("parsers", "naive:General,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture") | |||||
| PARSERS = LLM.get("parsers", "naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture") | |||||
| # distribution | # distribution | ||||
| DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False) | DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False) |
| sec_ids = [] | sec_ids = [] | ||||
| sid = 0 | sid = 0 | ||||
| for i, lvl in enumerate(levels): | for i, lvl in enumerate(levels): | ||||
| if lvl <= most_level: sid += 1 | |||||
| if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1 | |||||
| sec_ids.append(sid) | sec_ids.append(sid) | ||||
| #print(lvl, self.boxes[i]["text"], most_level) | #print(lvl, self.boxes[i]["text"], most_level) | ||||
| continue | continue | ||||
| chunks.append(txt + poss) | chunks.append(txt + poss) | ||||
| if sec_id >-1: last_sid = sec_id | if sec_id >-1: last_sid = sec_id | ||||
| return chunks | |||||
| return chunks, tbls | |||||
| def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): | def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): | ||||
| if re.search(r"\.pdf$", filename, re.IGNORECASE): | if re.search(r"\.pdf$", filename, re.IGNORECASE): | ||||
| pdf_parser = Pdf() | pdf_parser = Pdf() | ||||
| cks = pdf_parser(filename if not binary else binary, | |||||
| cks, tbls = pdf_parser(filename if not binary else binary, | |||||
| from_page=from_page, to_page=to_page, callback=callback) | from_page=from_page, to_page=to_page, callback=callback) | ||||
| else: raise NotImplementedError("file type not supported yet(pdf supported)") | else: raise NotImplementedError("file type not supported yet(pdf supported)") | ||||
| doc = { | doc = { | ||||
| i = 0 | i = 0 | ||||
| chunk = [] | chunk = [] | ||||
| tk_cnt = 0 | tk_cnt = 0 | ||||
| res = [] | |||||
| res = tokenize_table(tbls, doc, eng) | |||||
| def add_chunk(): | def add_chunk(): | ||||
| nonlocal chunk, res, doc, pdf_parser, tk_cnt | nonlocal chunk, res, doc, pdf_parser, tk_cnt | ||||
| d = copy.deepcopy(doc) | d = copy.deepcopy(doc) |
| def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): | def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): | ||||
| """ | """ | ||||
| Supported file formats are docx, pdf, txt. | |||||
| Supported file formats are docx, pdf, excel, txt. | |||||
| This method apply the naive ways to chunk files. | This method apply the naive ways to chunk files. | ||||
| Successive text will be sliced into pieces using 'delimiter'. | Successive text will be sliced into pieces using 'delimiter'. | ||||
| Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'. | Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'. |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # | |||||
| import copy | |||||
| import re | |||||
| from rag.app import laws | |||||
| from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions | |||||
| from deepdoc.parser import PdfParser, ExcelParser | |||||
| from rag.settings import cron_logger | |||||
class Pdf(PdfParser):
    """PDF parser that returns the document's text pieces in sorted order.

    Calling an instance runs the inherited processing steps in sequence and
    reports progress through ``callback``; the result is a flat list of
    strings (body text plus extracted table/figure text).
    """

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        """Parse a PDF and return its text sections as a list of strings.

        Args:
            filename: Path of the PDF; used when ``binary`` is falsy.
            binary: Optional in-memory content; takes precedence over
                ``filename`` when truthy.
            from_page: First page handed to the image/OCR step; also used
                to re-base the page number of table/figure positions.
            to_page: Last page handed to the image/OCR step.
            zoomin: Zoom factor forwarded to the inherited steps.
            callback: Progress reporter. It is invoked unconditionally
                (both as ``callback(msg=...)`` and ``callback(prog, msg)``),
                so a callable must be supplied despite the ``None`` default.

        Returns:
            list: The text of every section, ordered by the first position
            entry of each section.
        """
        from timeit import default_timer as timer

        source = binary if binary else filename

        callback(msg="OCR is running...")
        self.__images__(source, zoomin, from_page, to_page, callback)
        callback(msg="OCR finished")

        layout_started = timer()
        self._layouts_rec(zoomin)
        callback(0.63, "Layout analysis finished.")
        print("paddle layouts:", timer() - layout_started)

        self._table_transformer_job(zoomin)
        callback(0.65, "Table analysis finished.")

        self._text_merge()
        callback(0.67, "Text merging finished")

        tbls = self._extract_table_figure(True, zoomin, True, True)
        self._concat_downward()

        # Pair every text box with its position so that boxes and
        # tables/figures can be ordered together.
        located = [(box["text"], self.get_position(box, zoomin))
                   for box in self.boxes]
        for (_img, rows), positions in tbls:
            text = rows if isinstance(rows, str) else rows[0]
            # Shift the first position field relative to from_page
            # (presumably a page index -- confirm against PdfParser).
            shifted = [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4])
                       for p in positions]
            located.append((text, shifted))

        def first_position_key(item):
            # Order by fields 0, 3 and 1 of the first position entry
            # (presumably page, top, left -- confirm against PdfParser).
            first = item[-1][0]
            return first[0], first[3], first[1]

        located.sort(key=first_position_key)
        return [text for text, _ in located]
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
    """
    Supported file formats are docx, pdf, excel, txt.
    One file forms a chunk which maintains original text order.

    Args:
        filename: Name (or path) of the file; its extension selects the
            parser and it is stored as ``docnm_kwd``.
        binary: Optional in-memory file content. When falsy, pdf and txt
            files are read from ``filename``.
        from_page: Accepted for interface compatibility with the other
            chunkers; this function does not use it.
        to_page: Last page forwarded to the PDF parser.
        lang: ``"english"`` (case-insensitive) enables the English flag
            passed to ``tokenize``.
        callback: Progress reporter. It is called unconditionally, so a
            callable must be supplied despite the ``None`` default.
        **kwargs: Ignored.

    Returns:
        list: A single-element list holding the document dict.

    Raises:
        NotImplementedError: If the file extension is not docx/doc, pdf,
            xlsx/xls or txt.
    """
    eng = lang.lower() == "english"

    sections = []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections.extend(laws.Docx()(filename, binary))
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        # NOTE(review): from_page is not forwarded, so the parser starts at
        # its own default -- confirm this is intended for the "one" parser.
        sections = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback)

    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = ExcelParser()
        sections = [excel_parser.html(binary)]

    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        if binary:
            txt = binary.decode("utf-8")
        else:
            with open(filename, "r") as f:
                txt = f.read()
        # Keep plain strings (not tuples): the sections are joined with
        # "\n".join() below, which raises TypeError on non-string items.
        sections = [line for line in txt.split("\n") if line]
        callback(0.8, "Finish parsing.")

    else:
        raise NotImplementedError("file type not supported yet(docx, pdf, excel, txt supported)")

    doc = {
        "docnm_kwd": filename,
        # Tokenize the file name without its trailing alphabetic extension.
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    # The whole file becomes one chunk: all sections joined in order.
    tokenize(doc, "\n".join(sections), eng)
    return [doc]
if __name__ == "__main__":
    # Manual smoke run: chunk the file named on the command line
    # (from_page=0, to_page=10), discarding all progress reports.
    import sys

    chunk(
        sys.argv[1],
        from_page=0,
        to_page=10,
        callback=lambda prog=None, msg="": None,
    )
| EmbeddingModel = { | EmbeddingModel = { | ||||
| "Local": HuEmbedding, | "Local": HuEmbedding, | ||||
| "OpenAI": OpenAIEmbed, | "OpenAI": OpenAIEmbed, | ||||
| "通义千问": HuEmbedding, #QWenEmbed, | |||||
| "智谱AI": ZhipuEmbed, | |||||
| "Tongyi-Qianwen": HuEmbedding, #QWenEmbed, | |||||
| "ZHIPU-AI": ZhipuEmbed, | |||||
| "Moonshot": HuEmbedding | "Moonshot": HuEmbedding | ||||
| } | } | ||||
| CvModel = { | CvModel = { | ||||
| "OpenAI": GptV4, | "OpenAI": GptV4, | ||||
| "Local": LocalCV, | "Local": LocalCV, | ||||
| "通义千问": QWenCV, | |||||
| "智谱AI": Zhipu4V, | |||||
| "Tongyi-Qianwen": QWenCV, | |||||
| "ZHIPU-AI": Zhipu4V, | |||||
| "Moonshot": LocalCV | "Moonshot": LocalCV | ||||
| } | } | ||||
| ChatModel = { | ChatModel = { | ||||
| "OpenAI": GptTurbo, | "OpenAI": GptTurbo, | ||||
| "智谱AI": ZhipuChat, | |||||
| "通义千问": QWenChat, | |||||
| "ZHIPU-AI": ZhipuChat, | |||||
| "Tongyi-Qianwen": QWenChat, | |||||
| "Local": LocalLLM, | "Local": LocalLLM, | ||||
| "Moonshot": MoonshotChat | "Moonshot": MoonshotChat | ||||
| } | } |
| return [float(t) for t in txt.split("\t")] | return [float(t) for t in txt.split("\t")] | ||||
| def insert_citations(self, answer, chunks, chunk_v, | def insert_citations(self, answer, chunks, chunk_v, | ||||
| embd_mdl, tkweight=0.7, vtweight=0.3): | |||||
| embd_mdl, tkweight=0.1, vtweight=0.9): | |||||
| assert len(chunks) == len(chunk_v) | assert len(chunks) == len(chunk_v) | ||||
| pieces = re.split(r"(```)", answer) | pieces = re.split(r"(```)", answer) | ||||
| if len(pieces) >= 3: | if len(pieces) >= 3: | ||||
| chunks_tks, | chunks_tks, | ||||
| tkweight, vtweight) | tkweight, vtweight) | ||||
| mx = np.max(sim) * 0.99 | mx = np.max(sim) * 0.99 | ||||
| if mx < 0.7: | |||||
| if mx < 0.65: | |||||
| continue | continue | ||||
| cites[idx[i]] = list( | cites[idx[i]] = list( | ||||
| set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4] | set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4] |
| pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"])) | pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"])) | ||||
| page_size = 5 | page_size = 5 | ||||
| if r["parser_id"] == "paper": page_size = 12 | if r["parser_id"] == "paper": page_size = 12 | ||||
| if r["parser_id"] == "one": page_size = 1000000000 | |||||
| for s,e in r["parser_config"].get("pages", [(0,100000)]): | for s,e in r["parser_config"].get("pages", [(0,100000)]): | ||||
| e = min(e, pages) | e = min(e, pages) | ||||
| for p in range(s, e, page_size): | for p in range(s, e, page_size): |
| from io import BytesIO | from io import BytesIO | ||||
| import pandas as pd | import pandas as pd | ||||
| from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive | |||||
| from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one | |||||
| from api.db import LLMType, ParserType | from api.db import LLMType, ParserType | ||||
| from api.db.services.document_service import DocumentService | from api.db.services.document_service import DocumentService | ||||
| ParserType.TABLE.value: table, | ParserType.TABLE.value: table, | ||||
| ParserType.RESUME.value: resume, | ParserType.RESUME.value: resume, | ||||
| ParserType.PICTURE.value: picture, | ParserType.PICTURE.value: picture, | ||||
| ParserType.ONE.value: one, | |||||
| } | } | ||||