Просмотр исходного кода

Add 'One' chunk method (#137)

tags/v0.1.0
KevinHuSh 1 год назад
Родитель
Сommit
5875c8ba08
Аккаунт пользователя с таким Email не найден
11 измененных файлов: 143 добавлений и 24 удалений
  1. 2
    2
      README.md
  2. 1
    0
      api/db/__init__.py
  3. 10
    2
      api/db/init_data.py
  4. 6
    6
      api/settings.py
  5. 4
    4
      rag/app/manual.py
  6. 1
    1
      rag/app/naive.py
  7. 108
    0
      rag/app/one.py
  8. 6
    6
      rag/llm/__init__.py
  9. 2
    2
      rag/nlp/search.py
  10. 1
    0
      rag/svr/task_broker.py
  11. 2
    1
      rag/svr/task_executor.py

+ 2
- 2
README.md Просмотреть файл

> In **user_default_llm** of [service_conf.yaml](./docker/service_conf.yaml), you need to specify LLM factory and your own _API_KEY_. > In **user_default_llm** of [service_conf.yaml](./docker/service_conf.yaml), you need to specify LLM factory and your own _API_KEY_.
> It's O.K if you don't have _API_KEY_ at the moment, you can specify it later at the setting part after starting and logging in the system. > It's O.K if you don't have _API_KEY_ at the moment, you can specify it later at the setting part after starting and logging in the system.
> - We have supported the following LLM factories, and others are coming soon: > - We have supported the following LLM factories, and others are coming soon:
> [OpenAI](https://platform.openai.com/login?launch), [通义千问/QWen](https://dashscope.console.aliyun.com/model),
> [智谱AI/ZhipuAI](https://open.bigmodel.cn/)
> [OpenAI](https://platform.openai.com/login?launch), [Tongyi-Qianwen](https://dashscope.console.aliyun.com/model),
> [ZHIPU-AI](https://open.bigmodel.cn/), [Moonshot](https://platform.moonshot.cn/docs/docs)
```bash ```bash
121:/# git clone https://github.com/infiniflow/ragflow.git 121:/# git clone https://github.com/infiniflow/ragflow.git
121:/# cd ragflow/docker 121:/# cd ragflow/docker

+ 1
- 0
api/db/__init__.py Просмотреть файл

TABLE = "table" TABLE = "table"
NAIVE = "naive" NAIVE = "naive"
PICTURE = "picture" PICTURE = "picture"
ONE = "one"

+ 10
- 2
api/db/init_data.py Просмотреть файл

"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
"status": "1", "status": "1",
},{ },{
"name": "通义千问",
"name": "Tongyi-Qianwen",
"logo": "", "logo": "",
"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
"status": "1", "status": "1",
},{ },{
"name": "智谱AI",
"name": "ZHIPU-AI",
"logo": "", "logo": "",
"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION", "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
"status": "1", "status": "1",
except Exception as e: except Exception as e:
pass pass
"""
drop table llm;
drop table factories;
update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问';
update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI';
update tenant set parser_ids='naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture';
"""
def init_web_data(): def init_web_data():
start_time = time.time() start_time = time.time()

+ 6
- 6
api/settings.py Просмотреть файл

USE_REGISTRY = get_base_config("use_registry") USE_REGISTRY = get_base_config("use_registry")
default_llm = { default_llm = {
"通义千问": {
"Tongyi-Qianwen": {
"chat_model": "qwen-plus", "chat_model": "qwen-plus",
"embedding_model": "text-embedding-v2", "embedding_model": "text-embedding-v2",
"image2text_model": "qwen-vl-max", "image2text_model": "qwen-vl-max",
"image2text_model": "gpt-4-vision-preview", "image2text_model": "gpt-4-vision-preview",
"asr_model": "whisper-1", "asr_model": "whisper-1",
}, },
"智谱AI": {
"ZHIPU-AI": {
"chat_model": "glm-3-turbo", "chat_model": "glm-3-turbo",
"embedding_model": "embedding-2", "embedding_model": "embedding-2",
"image2text_model": "glm-4v", "image2text_model": "glm-4v",
} }
} }
LLM = get_base_config("user_default_llm", {}) LLM = get_base_config("user_default_llm", {})
LLM_FACTORY = LLM.get("factory", "通义千问")
LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen")
if LLM_FACTORY not in default_llm: if LLM_FACTORY not in default_llm:
print("\33[91m【ERROR】\33[0m:", f"LLM factory {LLM_FACTORY} has not supported yet, switch to '通义千问/QWen' automatically, and please check the API_KEY in service_conf.yaml.")
LLM_FACTORY = "通义千问"
print("\33[91m【ERROR】\33[0m:", f"LLM factory {LLM_FACTORY} has not supported yet, switch to 'Tongyi-Qianwen/QWen' automatically, and please check the API_KEY in service_conf.yaml.")
LLM_FACTORY = "Tongyi-Qianwen"
CHAT_MDL = default_llm[LLM_FACTORY]["chat_model"] CHAT_MDL = default_llm[LLM_FACTORY]["chat_model"]
EMBEDDING_MDL = default_llm[LLM_FACTORY]["embedding_model"] EMBEDDING_MDL = default_llm[LLM_FACTORY]["embedding_model"]
ASR_MDL = default_llm[LLM_FACTORY]["asr_model"] ASR_MDL = default_llm[LLM_FACTORY]["asr_model"]
IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"] IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
API_KEY = LLM.get("api_key", "") API_KEY = LLM.get("api_key", "")
PARSERS = LLM.get("parsers", "naive:General,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture")
PARSERS = LLM.get("parsers", "naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture")
# distribution # distribution
DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False) DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)

+ 4
- 4
rag/app/manual.py Просмотреть файл

sec_ids = [] sec_ids = []
sid = 0 sid = 0
for i, lvl in enumerate(levels): for i, lvl in enumerate(levels):
if lvl <= most_level: sid += 1
if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1
sec_ids.append(sid) sec_ids.append(sid)
#print(lvl, self.boxes[i]["text"], most_level) #print(lvl, self.boxes[i]["text"], most_level)
continue continue
chunks.append(txt + poss) chunks.append(txt + poss)
if sec_id >-1: last_sid = sec_id if sec_id >-1: last_sid = sec_id
return chunks
return chunks, tbls
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
if re.search(r"\.pdf$", filename, re.IGNORECASE): if re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() pdf_parser = Pdf()
cks = pdf_parser(filename if not binary else binary,
cks, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback) from_page=from_page, to_page=to_page, callback=callback)
else: raise NotImplementedError("file type not supported yet(pdf supported)") else: raise NotImplementedError("file type not supported yet(pdf supported)")
doc = { doc = {
i = 0 i = 0
chunk = [] chunk = []
tk_cnt = 0 tk_cnt = 0
res = []
res = tokenize_table(tbls, doc, eng)
def add_chunk(): def add_chunk():
nonlocal chunk, res, doc, pdf_parser, tk_cnt nonlocal chunk, res, doc, pdf_parser, tk_cnt
d = copy.deepcopy(doc) d = copy.deepcopy(doc)

+ 1
- 1
rag/app/naive.py Просмотреть файл

def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
""" """
Supported file formats are docx, pdf, txt.
Supported file formats are docx, pdf, excel, txt.
This method apply the naive ways to chunk files. This method apply the naive ways to chunk files.
Successive text will be sliced into pieces using 'delimiter'. Successive text will be sliced into pieces using 'delimiter'.
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'. Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.

+ 108
- 0
rag/app/one.py Просмотреть файл

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
from rag.app import laws
from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
from deepdoc.parser import PdfParser, ExcelParser
from rag.settings import cron_logger
class Pdf(PdfParser):
    """PDF parser that flattens a document into an ordered list of text strings.

    Text boxes and extracted tables/figures are collected together with their
    positions, sorted by position, and returned as bare strings.
    """

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        """Run OCR, layout and table analysis, then return the ordered texts.

        Args:
            filename: Path of the PDF; used as the source when `binary` is falsy.
            binary: Optional in-memory content; takes precedence over `filename`.
            from_page: First page handed to the image extraction step; also
                subtracted from the page index of table positions.
            to_page: Last page handed to the image extraction step.
            zoomin: Zoom factor forwarded to the inherited analysis steps.
            callback: Progress reporter, called as `callback(progress, msg)` or
                `callback(msg=...)`. It is invoked unconditionally, so the
                `None` default is not actually usable here.

        Returns:
            A list of strings, one per text box or table, in sorted order.
        """
        from timeit import default_timer as timer

        source = binary if binary else filename
        callback(msg="OCR is running...")
        self.__images__(source, zoomin, from_page, to_page, callback)
        callback(msg="OCR finished")

        began = timer()
        self._layouts_rec(zoomin)
        callback(0.63, "Layout analysis finished.")
        print("paddle layouts:", timer() - began)

        self._table_transformer_job(zoomin)
        callback(0.65, "Table analysis finished.")

        self._text_merge()
        callback(0.67, "Text merging finished")

        tbls = self._extract_table_figure(True, zoomin, True, True)
        self._concat_downward()

        # Pair every piece of text with its list of position tuples.
        located = []
        for box in self.boxes:
            located.append((box["text"], self.get_position(box, zoomin)))
        for (_, rows), positions in tbls:
            content = rows if isinstance(rows, str) else rows[0]
            shifted = [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4])
                       for p in positions]
            located.append((content, shifted))

        def order_key(item):
            # Sort on fields 0, 3 and 1 of the first position tuple
            # (presumably page, top, left - TODO confirm against get_position).
            first = item[-1][0]
            return first[0], first[3], first[1]

        located.sort(key=order_key)
        return [text for text, _ in located]
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
    """
    Supported file formats are docx, pdf, excel, txt.
    One file forms a chunk which maintains original text order.

    Args:
        filename: Name (or path) of the file; its extension selects the parser
            and it is stored as `docnm_kwd` of the resulting chunk.
        binary: Optional in-memory file content. When falsy, docx/pdf/txt are
            read from `filename`; the excel parser always receives `binary`.
        from_page: Accepted for interface parity with the other chunkers; it
            is not forwarded to any parser here.
        to_page: Upper page bound forwarded to the PDF parser only.
        lang: Language name; "english" (case-insensitive) selects English
            tokenization.
        callback: Progress reporter, called as `callback(progress, msg)`.
        **kwargs: Ignored.

    Returns:
        A single-element list holding the document dict filled by `tokenize`.

    Raises:
        NotImplementedError: If the file extension is not docx, pdf, excel or txt.
    """
    eng = lang.lower() == "english"  # is_english(cks)

    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = list(laws.Docx()(filename, binary))
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        # NOTE(review): from_page is deliberately left at the parser default, as in
        # the original; the whole file is meant to become one chunk.
        sections = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback)

    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = ExcelParser()
        sections = [excel_parser.html(binary)]

    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        if binary:
            txt = binary.decode("utf-8")
        else:
            # Read with the same encoding the in-memory path assumes.
            with open(filename, "r", encoding="utf-8") as f:
                txt = f.read()
        # Keep plain strings: the sections are joined with "\n" below, and
        # str.join raises TypeError on anything that is not a str.
        sections = [line for line in txt.split("\n") if line]
        callback(0.8, "Finish parsing.")

    else:
        raise NotImplementedError("file type not supported yet(docx, pdf, excel, txt supported)")

    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    # The whole file becomes one chunk, so all sections are tokenized together.
    tokenize(doc, "\n".join(sections), eng)
    return [doc]
if __name__ == "__main__":
    # Manual smoke run: chunk the file given as the first CLI argument,
    # limited to pages 0-10, discarding all progress updates.
    import sys

    def _silent(prog=None, msg=""):
        """Progress callback that ignores every update."""
        return None

    chunk(sys.argv[1], from_page=0, to_page=10, callback=_silent)

+ 6
- 6
rag/llm/__init__.py Просмотреть файл

EmbeddingModel = { EmbeddingModel = {
"Local": HuEmbedding, "Local": HuEmbedding,
"OpenAI": OpenAIEmbed, "OpenAI": OpenAIEmbed,
"通义千问": HuEmbedding, #QWenEmbed,
"智谱AI": ZhipuEmbed,
"Tongyi-Qianwen": HuEmbedding, #QWenEmbed,
"ZHIPU-AI": ZhipuEmbed,
"Moonshot": HuEmbedding "Moonshot": HuEmbedding
} }


CvModel = { CvModel = {
"OpenAI": GptV4, "OpenAI": GptV4,
"Local": LocalCV, "Local": LocalCV,
"通义千问": QWenCV,
"智谱AI": Zhipu4V,
"Tongyi-Qianwen": QWenCV,
"ZHIPU-AI": Zhipu4V,
"Moonshot": LocalCV "Moonshot": LocalCV
} }




ChatModel = { ChatModel = {
"OpenAI": GptTurbo, "OpenAI": GptTurbo,
"智谱AI": ZhipuChat,
"通义千问": QWenChat,
"ZHIPU-AI": ZhipuChat,
"Tongyi-Qianwen": QWenChat,
"Local": LocalLLM, "Local": LocalLLM,
"Moonshot": MoonshotChat "Moonshot": MoonshotChat
} }

+ 2
- 2
rag/nlp/search.py Просмотреть файл

return [float(t) for t in txt.split("\t")] return [float(t) for t in txt.split("\t")]


def insert_citations(self, answer, chunks, chunk_v, def insert_citations(self, answer, chunks, chunk_v,
embd_mdl, tkweight=0.7, vtweight=0.3):
embd_mdl, tkweight=0.1, vtweight=0.9):
assert len(chunks) == len(chunk_v) assert len(chunks) == len(chunk_v)
pieces = re.split(r"(```)", answer) pieces = re.split(r"(```)", answer)
if len(pieces) >= 3: if len(pieces) >= 3:
chunks_tks, chunks_tks,
tkweight, vtweight) tkweight, vtweight)
mx = np.max(sim) * 0.99 mx = np.max(sim) * 0.99
if mx < 0.7:
if mx < 0.65:
continue continue
cites[idx[i]] = list( cites[idx[i]] = list(
set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4] set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]

+ 1
- 0
rag/svr/task_broker.py Просмотреть файл

pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"])) pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
page_size = 5 page_size = 5
if r["parser_id"] == "paper": page_size = 12 if r["parser_id"] == "paper": page_size = 12
if r["parser_id"] == "one": page_size = 1000000000
for s,e in r["parser_config"].get("pages", [(0,100000)]): for s,e in r["parser_config"].get("pages", [(0,100000)]):
e = min(e, pages) e = min(e, pages)
for p in range(s, e, page_size): for p in range(s, e, page_size):

+ 2
- 1
rag/svr/task_executor.py Просмотреть файл

from io import BytesIO from io import BytesIO
import pandas as pd import pandas as pd


from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one


from api.db import LLMType, ParserType from api.db import LLMType, ParserType
from api.db.services.document_service import DocumentService from api.db.services.document_service import DocumentService
ParserType.TABLE.value: table, ParserType.TABLE.value: table,
ParserType.RESUME.value: resume, ParserType.RESUME.value: resume,
ParserType.PICTURE.value: picture, ParserType.PICTURE.value: picture,
ParserType.ONE.value: one,
} }





Загрузка…
Отмена
Сохранить