### What problem does this PR solve?

The related source files were stored with Windows/DOS (CRLF) line endings; this PR converts them to Unix (LF) line endings.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
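For reference, a minimal sketch of the kind of conversion applied here, assuming a plain CRLF-to-LF byte rewrite (the file list below is illustrative, not the exact set touched by this PR; a tool such as `dos2unix` would work equally well):

```python
# Minimal sketch: rewrite CRLF line endings as LF for a set of files.
# Assumption: a plain byte-level replacement is sufficient; the file list
# is illustrative rather than the exact set changed in this PR.
from pathlib import Path


def to_unix(path: Path) -> None:
    data = path.read_bytes()
    converted = data.replace(b"\r\n", b"\n")
    if converted != data:  # only rewrite files that actually contain CRLF
        path.write_bytes(converted)


for name in ["Dockerfile", "agent/component/baidu.py"]:
    to_unix(Path(name))
```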
@@ -1,27 +1,27 @@

FROM infiniflow/ragflow-base:v2.0
USER root

WORKDIR /ragflow

## for cuda > 12.0
RUN pip uninstall -y onnxruntime-gpu
RUN pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/

ADD ./web ./web
RUN cd ./web && npm i --force && npm run build

ADD ./api ./api
ADD ./conf ./conf
ADD ./deepdoc ./deepdoc
ADD ./rag ./rag
ADD ./agent ./agent
ADD ./graphrag ./graphrag

ENV PYTHONPATH=/ragflow/
ENV HF_ENDPOINT=https://hf-mirror.com

ADD docker/entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]
@@ -1,56 +1,56 @@

FROM ubuntu:22.04
USER root

WORKDIR /ragflow

RUN apt-get update && apt-get install -y wget curl build-essential libopenmpi-dev

RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
    bash ~/miniconda.sh -b -p /root/miniconda3 && \
    rm ~/miniconda.sh && ln -s /root/miniconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
    echo ". /root/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate base" >> ~/.bashrc

ENV PATH /root/miniconda3/bin:$PATH

RUN conda create -y --name py11 python=3.11

ENV CONDA_DEFAULT_ENV py11
ENV CONDA_PREFIX /root/miniconda3/envs/py11
ENV PATH $CONDA_PREFIX/bin:$PATH

RUN curl -sL https://deb.nodesource.com/setup_14.x | bash -
RUN apt-get install -y nodejs
RUN apt-get install -y nginx

ADD ./web ./web
ADD ./api ./api
ADD ./conf ./conf
ADD ./deepdoc ./deepdoc
ADD ./rag ./rag
ADD ./requirements.txt ./requirements.txt
ADD ./agent ./agent
ADD ./graphrag ./graphrag

RUN apt install openmpi-bin openmpi-common libopenmpi-dev
ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/openmpi/lib:$LD_LIBRARY_PATH

RUN rm /root/miniconda3/envs/py11/compiler_compat/ld

RUN cd ./web && npm i --force && npm run build

RUN conda run -n py11 pip install -i https://mirrors.aliyun.com/pypi/simple/ -r ./requirements.txt

RUN apt-get update && \
    apt-get install -y libglib2.0-0 libgl1-mesa-glx && \
    rm -rf /var/lib/apt/lists/*

RUN conda run -n py11 pip install -i https://mirrors.aliyun.com/pypi/simple/ ollama

RUN conda run -n py11 python -m nltk.downloader punkt
RUN conda run -n py11 python -m nltk.downloader wordnet

ENV PYTHONPATH=/ragflow/
ENV HF_ENDPOINT=https://hf-mirror.com

ADD docker/entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]
@@ -1,58 +1,58 @@

FROM opencloudos/opencloudos:9.0
USER root

WORKDIR /ragflow

RUN dnf update -y && dnf install -y wget curl gcc-c++ openmpi-devel

RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
    bash ~/miniconda.sh -b -p /root/miniconda3 && \
    rm ~/miniconda.sh && ln -s /root/miniconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
    echo ". /root/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate base" >> ~/.bashrc

ENV PATH /root/miniconda3/bin:$PATH

RUN conda create -y --name py11 python=3.11

ENV CONDA_DEFAULT_ENV py11
ENV CONDA_PREFIX /root/miniconda3/envs/py11
ENV PATH $CONDA_PREFIX/bin:$PATH

# RUN curl -sL https://rpm.nodesource.com/setup_14.x | bash -
RUN dnf install -y nodejs
RUN dnf install -y nginx

ADD ./web ./web
ADD ./api ./api
ADD ./conf ./conf
ADD ./deepdoc ./deepdoc
ADD ./rag ./rag
ADD ./requirements.txt ./requirements.txt
ADD ./agent ./agent
ADD ./graphrag ./graphrag

RUN dnf install -y openmpi openmpi-devel python3-openmpi
ENV C_INCLUDE_PATH /usr/include/openmpi-x86_64:$C_INCLUDE_PATH
ENV LD_LIBRARY_PATH /usr/lib64/openmpi/lib:$LD_LIBRARY_PATH

RUN rm /root/miniconda3/envs/py11/compiler_compat/ld

RUN cd ./web && npm i --force && npm run build

RUN conda run -n py11 pip install $(grep -ivE "mpi4py" ./requirements.txt) # without mpi4py==3.1.5
RUN conda run -n py11 pip install redis

RUN dnf update -y && \
    dnf install -y glib2 mesa-libGL && \
    dnf clean all

RUN conda run -n py11 pip install ollama

RUN conda run -n py11 python -m nltk.downloader punkt
RUN conda run -n py11 python -m nltk.downloader wordnet

ENV PYTHONPATH=/ragflow/
ENV HF_ENDPOINT=https://hf-mirror.com

ADD docker/entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]
@@ -1,69 +1,69 @@

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import random
from abc import ABC
from functools import partial
import pandas as pd
import requests
import re
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase


class BaiduParam(ComponentParamBase):
    """
    Define the Baidu component parameters.
    """

    def __init__(self):
        super().__init__()
        self.top_n = 10

    def check(self):
        self.check_positive_integer(self.top_n, "Top N")


class Baidu(ComponentBase, ABC):
    component_name = "Baidu"

    def _run(self, history, **kwargs):
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return Baidu.be_output("")

        try:
            url = 'https://www.baidu.com/s?wd=' + ans + '&rn=' + str(self._param.top_n)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
            response = requests.get(url=url, headers=headers)

            url_res = re.findall(r"'url': \\\"(.*?)\\\"}", response.text)
            title_res = re.findall(r"'title': \\\"(.*?)\\\",\\n", response.text)
            body_res = re.findall(r"\"contentText\":\"(.*?)\"", response.text)
            baidu_res = [{"content": re.sub('<em>|</em>', '', '<a href="' + url + '">' + title + '</a> ' + body)} for
                         url, title, body in zip(url_res, title_res, body_res)]
            del body_res, url_res, title_res
        except Exception as e:
            return Baidu.be_output("**ERROR**: " + str(e))

        if not baidu_res:
            return Baidu.be_output("")

        df = pd.DataFrame(baidu_res)
        if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
        return df
@@ -1,99 +1,99 @@

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import random
from abc import ABC
import requests
from agent.component.base import ComponentBase, ComponentParamBase
from hashlib import md5


class BaiduFanyiParam(ComponentParamBase):
    """
    Define the BaiduFanyi component parameters.
    """

    def __init__(self):
        super().__init__()
        self.appid = "xxx"
        self.secret_key = "xxx"
        self.trans_type = 'translate'
        self.parameters = []
        self.source_lang = 'auto'
        self.target_lang = 'auto'
        self.domain = 'finance'

    def check(self):
        self.check_positive_integer(self.top_n, "Top N")
        self.check_empty(self.appid, "BaiduFanyi APPID")
        self.check_empty(self.secret_key, "BaiduFanyi Secret Key")
        self.check_valid_value(self.trans_type, "Translate type", ['translate', 'fieldtranslate'])
        self.check_valid_value(self.domain, "Translate domain",
                               ['it', 'finance', 'machinery', 'senimed', 'novel', 'academic', 'aerospace', 'wiki',
                                'news', 'law', 'contract'])
        self.check_valid_value(self.source_lang, "Source language",
                               ['auto', 'zh', 'en', 'yue', 'wyw', 'jp', 'kor', 'fra', 'spa', 'th', 'ara', 'ru', 'pt',
                                'de', 'it', 'el', 'nl', 'pl', 'bul', 'est', 'dan', 'fin', 'cs', 'rom', 'slo', 'swe',
                                'hu', 'cht', 'vie'])
        self.check_valid_value(self.target_lang, "Target language",
                               ['auto', 'zh', 'en', 'yue', 'wyw', 'jp', 'kor', 'fra', 'spa', 'th', 'ara', 'ru', 'pt',
                                'de', 'it', 'el', 'nl', 'pl', 'bul', 'est', 'dan', 'fin', 'cs', 'rom', 'slo', 'swe',
                                'hu', 'cht', 'vie'])
        self.check_valid_value(self.domain, "Translate field",
                               ['it', 'finance', 'machinery', 'senimed', 'novel', 'academic', 'aerospace', 'wiki',
                                'news', 'law', 'contract'])


class BaiduFanyi(ComponentBase, ABC):
    component_name = "BaiduFanyi"

    def _run(self, history, **kwargs):
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return BaiduFanyi.be_output("")

        try:
            source_lang = self._param.source_lang
            target_lang = self._param.target_lang
            appid = self._param.appid
            # The salt is concatenated into the sign and the query string, so keep it as a string.
            salt = str(random.randint(32768, 65536))
            secret_key = self._param.secret_key

            if self._param.trans_type == 'translate':
                sign = md5((appid + ans + salt + secret_key).encode('utf-8')).hexdigest()
                url = 'http://api.fanyi.baidu.com/api/trans/vip/translate?' + 'q=' + ans + '&from=' + source_lang + '&to=' + target_lang + '&appid=' + appid + '&salt=' + salt + '&sign=' + sign
                headers = {"Content-Type": "application/x-www-form-urlencoded"}
                response = requests.post(url=url, headers=headers).json()

                if response.get('error_code'):
                    return BaiduFanyi.be_output("**Error**:" + response['error_msg'])

                return BaiduFanyi.be_output(response['trans_result'][0]['dst'])
            elif self._param.trans_type == 'fieldtranslate':
                domain = self._param.domain
                sign = md5((appid + ans + salt + domain + secret_key).encode('utf-8')).hexdigest()
                url = 'http://api.fanyi.baidu.com/api/trans/vip/fieldtranslate?' + 'q=' + ans + '&from=' + source_lang + '&to=' + target_lang + '&appid=' + appid + '&salt=' + salt + '&domain=' + domain + '&sign=' + sign
                headers = {"Content-Type": "application/x-www-form-urlencoded"}
                response = requests.post(url=url, headers=headers).json()

                if response.get('error_code'):
                    return BaiduFanyi.be_output("**Error**:" + response['error_msg'])

                return BaiduFanyi.be_output(response['trans_result'][0]['dst'])
        except Exception as e:
            return BaiduFanyi.be_output("**Error**:" + str(e))
@@ -1,85 +1,85 @@

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC
import requests
import pandas as pd
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase


class BingParam(ComponentParamBase):
    """
    Define the Bing component parameters.
    """

    def __init__(self):
        super().__init__()
        self.top_n = 10
        self.channel = "Webpages"
        self.api_key = "YOUR_ACCESS_KEY"
        self.country = "CN"
        self.language = "en"

    def check(self):
        self.check_positive_integer(self.top_n, "Top N")
        self.check_valid_value(self.channel, "Bing Web Search or Bing News", ["Webpages", "News"])
        self.check_empty(self.api_key, "Bing subscription key")
        self.check_valid_value(self.country, "Bing Country",
                               ['AR', 'AU', 'AT', 'BE', 'BR', 'CA', 'CL', 'DK', 'FI', 'FR', 'DE', 'HK', 'IN', 'ID',
                                'IT', 'JP', 'KR', 'MY', 'MX', 'NL', 'NZ', 'NO', 'CN', 'PL', 'PT', 'PH', 'RU', 'SA',
                                'ZA', 'ES', 'SE', 'CH', 'TW', 'TR', 'GB', 'US'])
        self.check_valid_value(self.language, "Bing Languages",
                               ['ar', 'eu', 'bn', 'bg', 'ca', 'ns', 'nt', 'hr', 'cs', 'da', 'nl', 'en', 'gb', 'et',
                                'fi', 'fr', 'gl', 'de', 'gu', 'he', 'hi', 'hu', 'is', 'it', 'jp', 'kn', 'ko', 'lv',
                                'lt', 'ms', 'ml', 'mr', 'nb', 'pl', 'br', 'pt', 'pa', 'ro', 'ru', 'sr', 'sk', 'sl',
                                'es', 'sv', 'ta', 'te', 'th', 'tr', 'uk', 'vi'])


class Bing(ComponentBase, ABC):
    component_name = "Bing"

    def _run(self, history, **kwargs):
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return Bing.be_output("")

        try:
            headers = {"Ocp-Apim-Subscription-Key": self._param.api_key, 'Accept-Language': self._param.language}
            params = {"q": ans, "textDecorations": True, "textFormat": "HTML", "cc": self._param.country,
                      "answerCount": 1, "promote": self._param.channel}
            if self._param.channel == "Webpages":
                response = requests.get("https://api.bing.microsoft.com/v7.0/search", headers=headers, params=params)
                response.raise_for_status()
                search_results = response.json()
                bing_res = [{"content": '<a href="' + i["url"] + '">' + i["name"] + '</a> ' + i["snippet"]} for i in
                            search_results["webPages"]["value"]]
            elif self._param.channel == "News":
                response = requests.get("https://api.bing.microsoft.com/v7.0/news/search", headers=headers,
                                        params=params)
                response.raise_for_status()
                search_results = response.json()
                bing_res = [{"content": '<a href="' + i["url"] + '">' + i["name"] + '</a> ' + i["description"]} for i
                            in search_results['news']['value']]
        except Exception as e:
            return Bing.be_output("**ERROR**: " + str(e))

        if not bing_res:
            return Bing.be_output("")

        df = pd.DataFrame(bing_res)
        if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
        return df
@@ -1,62 +1,62 @@

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC
import re
from agent.component.base import ComponentBase, ComponentParamBase
import deepl


class DeepLParam(ComponentParamBase):
    """
    Define the DeepL component parameters.
    """

    def __init__(self):
        super().__init__()
        self.auth_key = "xxx"
        self.parameters = []
        self.source_lang = 'ZH'
        self.target_lang = 'EN-GB'

    def check(self):
        self.check_positive_integer(self.top_n, "Top N")
        self.check_valid_value(self.source_lang, "Source language",
                               ['AR', 'BG', 'CS', 'DA', 'DE', 'EL', 'EN', 'ES', 'ET', 'FI', 'FR', 'HU', 'ID', 'IT',
                                'JA', 'KO', 'LT', 'LV', 'NB', 'NL', 'PL', 'PT', 'RO', 'RU', 'SK', 'SL', 'SV', 'TR',
                                'UK', 'ZH'])
        self.check_valid_value(self.target_lang, "Target language",
                               ['AR', 'BG', 'CS', 'DA', 'DE', 'EL', 'EN-GB', 'EN-US', 'ES', 'ET', 'FI', 'FR', 'HU',
                                'ID', 'IT', 'JA', 'KO', 'LT', 'LV', 'NB', 'NL', 'PL', 'PT-BR', 'PT-PT', 'RO', 'RU',
                                'SK', 'SL', 'SV', 'TR', 'UK', 'ZH'])


class DeepL(ComponentBase, ABC):
    component_name = "DeepL"

    def _run(self, history, **kwargs):
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return DeepL.be_output("")

        try:
            translator = deepl.Translator(self._param.auth_key)
            result = translator.translate_text(ans, source_lang=self._param.source_lang,
                                               target_lang=self._param.target_lang)
            return DeepL.be_output(result.text)
        except Exception as e:
            return DeepL.be_output("**Error**:" + str(e))
@@ -1,61 +1,61 @@

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC
import pandas as pd
import requests
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase


class GitHubParam(ComponentParamBase):
    """
    Define the GitHub component parameters.
    """

    def __init__(self):
        super().__init__()
        self.top_n = 10

    def check(self):
        self.check_positive_integer(self.top_n, "Top N")


class GitHub(ComponentBase, ABC):
    component_name = "GitHub"

    def _run(self, history, **kwargs):
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return GitHub.be_output("")

        try:
            url = 'https://api.github.com/search/repositories?q=' + ans + '&sort=stars&order=desc&per_page=' + str(
                self._param.top_n)
            headers = {"Content-Type": "application/vnd.github+json", "X-GitHub-Api-Version": '2022-11-28'}
            response = requests.get(url=url, headers=headers).json()

            github_res = [{"content": '<a href="' + i["html_url"] + '">' + i["name"] + '</a>' + str(
                i["description"]) + '\n stars:' + str(i['watchers'])} for i in response['items']]
        except Exception as e:
            return GitHub.be_output("**ERROR**: " + str(e))

        if not github_res:
            return GitHub.be_output("")

        df = pd.DataFrame(github_res)
        if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
        return df
@@ -1,96 +1,96 @@

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC
from serpapi import GoogleSearch
import pandas as pd
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase


class GoogleParam(ComponentParamBase):
    """
    Define the Google component parameters.
    """

    def __init__(self):
        super().__init__()
        self.top_n = 10
        self.api_key = "xxx"
        self.country = "cn"
        self.language = "en"

    def check(self):
        self.check_positive_integer(self.top_n, "Top N")
        self.check_empty(self.api_key, "SerpApi API key")
        self.check_valid_value(self.country, "Google Country",
                               ['af', 'al', 'dz', 'as', 'ad', 'ao', 'ai', 'aq', 'ag', 'ar', 'am', 'aw', 'au', 'at',
                                'az', 'bs', 'bh', 'bd', 'bb', 'by', 'be', 'bz', 'bj', 'bm', 'bt', 'bo', 'ba', 'bw',
                                'bv', 'br', 'io', 'bn', 'bg', 'bf', 'bi', 'kh', 'cm', 'ca', 'cv', 'ky', 'cf', 'td',
                                'cl', 'cn', 'cx', 'cc', 'co', 'km', 'cg', 'cd', 'ck', 'cr', 'ci', 'hr', 'cu', 'cy',
                                'cz', 'dk', 'dj', 'dm', 'do', 'ec', 'eg', 'sv', 'gq', 'er', 'ee', 'et', 'fk', 'fo',
                                'fj', 'fi', 'fr', 'gf', 'pf', 'tf', 'ga', 'gm', 'ge', 'de', 'gh', 'gi', 'gr', 'gl',
                                'gd', 'gp', 'gu', 'gt', 'gn', 'gw', 'gy', 'ht', 'hm', 'va', 'hn', 'hk', 'hu', 'is',
                                'in', 'id', 'ir', 'iq', 'ie', 'il', 'it', 'jm', 'jp', 'jo', 'kz', 'ke', 'ki', 'kp',
                                'kr', 'kw', 'kg', 'la', 'lv', 'lb', 'ls', 'lr', 'ly', 'li', 'lt', 'lu', 'mo', 'mk',
                                'mg', 'mw', 'my', 'mv', 'ml', 'mt', 'mh', 'mq', 'mr', 'mu', 'yt', 'mx', 'fm', 'md',
                                'mc', 'mn', 'ms', 'ma', 'mz', 'mm', 'na', 'nr', 'np', 'nl', 'an', 'nc', 'nz', 'ni',
                                'ne', 'ng', 'nu', 'nf', 'mp', 'no', 'om', 'pk', 'pw', 'ps', 'pa', 'pg', 'py', 'pe',
                                'ph', 'pn', 'pl', 'pt', 'pr', 'qa', 're', 'ro', 'ru', 'rw', 'sh', 'kn', 'lc', 'pm',
                                'vc', 'ws', 'sm', 'st', 'sa', 'sn', 'rs', 'sc', 'sl', 'sg', 'sk', 'si', 'sb', 'so',
                                'za', 'gs', 'es', 'lk', 'sd', 'sr', 'sj', 'sz', 'se', 'ch', 'sy', 'tw', 'tj', 'tz',
                                'th', 'tl', 'tg', 'tk', 'to', 'tt', 'tn', 'tr', 'tm', 'tc', 'tv', 'ug', 'ua', 'ae',
                                'uk', 'gb', 'us', 'um', 'uy', 'uz', 'vu', 've', 'vn', 'vg', 'vi', 'wf', 'eh', 'ye',
                                'zm', 'zw'])
        self.check_valid_value(self.language, "Google languages",
                               ['af', 'ak', 'sq', 'ws', 'am', 'ar', 'hy', 'az', 'eu', 'be', 'bem', 'bn', 'bh',
                                'xx-bork', 'bs', 'br', 'bg', 'bt', 'km', 'ca', 'chr', 'ny', 'zh-cn', 'zh-tw', 'co',
                                'hr', 'cs', 'da', 'nl', 'xx-elmer', 'en', 'eo', 'et', 'ee', 'fo', 'tl', 'fi', 'fr',
                                'fy', 'gaa', 'gl', 'ka', 'de', 'el', 'kl', 'gn', 'gu', 'xx-hacker', 'ht', 'ha', 'haw',
                                'iw', 'hi', 'hu', 'is', 'ig', 'id', 'ia', 'ga', 'it', 'ja', 'jw', 'kn', 'kk', 'rw',
                                'rn', 'xx-klingon', 'kg', 'ko', 'kri', 'ku', 'ckb', 'ky', 'lo', 'la', 'lv', 'ln', 'lt',
                                'loz', 'lg', 'ach', 'mk', 'mg', 'ms', 'ml', 'mt', 'mv', 'mi', 'mr', 'mfe', 'mo', 'mn',
                                'sr-me', 'my', 'ne', 'pcm', 'nso', 'no', 'nn', 'oc', 'or', 'om', 'ps', 'fa',
                                'xx-pirate', 'pl', 'pt', 'pt-br', 'pt-pt', 'pa', 'qu', 'ro', 'rm', 'nyn', 'ru', 'gd',
                                'sr', 'sh', 'st', 'tn', 'crs', 'sn', 'sd', 'si', 'sk', 'sl', 'so', 'es', 'es-419', 'su',
                                'sw', 'sv', 'tg', 'ta', 'tt', 'te', 'th', 'ti', 'to', 'lua', 'tum', 'tr', 'tk', 'tw',
                                'ug', 'uk', 'ur', 'uz', 'vu', 'vi', 'cy', 'wo', 'xh', 'yi', 'yo', 'zu']
                               )


class Google(ComponentBase, ABC):
    component_name = "Google"

    def _run(self, history, **kwargs):
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return Google.be_output("")

        try:
            client = GoogleSearch(
                {"engine": "google", "q": ans, "api_key": self._param.api_key, "gl": self._param.country,
                 "hl": self._param.language, "num": self._param.top_n})
            google_res = [{"content": '<a href="' + i["link"] + '">' + i["title"] + '</a> ' + i["snippet"]} for i in
                          client.get_dict()["organic_results"]]
        except Exception as e:
            return Google.be_output("**ERROR**: Existing Unavailable Parameters!")

        if not google_res:
            return Google.be_output("")

        df = pd.DataFrame(google_res)
        if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
        return df
@@ -1,70 +1,70 @@

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from abc import ABC
import pandas as pd
from agent.settings import DEBUG
from agent.component.base import ComponentBase, ComponentParamBase
from scholarly import scholarly


class GoogleScholarParam(ComponentParamBase):
    """
    Define the GoogleScholar component parameters.
    """

    def __init__(self):
        super().__init__()
        self.top_n = 6
        self.sort_by = 'relevance'
        self.year_low = None
        self.year_high = None
        self.patents = True

    def check(self):
        self.check_positive_integer(self.top_n, "Top N")
        self.check_valid_value(self.sort_by, "GoogleScholar Sort_by", ['date', 'relevance'])
        self.check_boolean(self.patents, "Whether or not to include patents, defaults to True")


class GoogleScholar(ComponentBase, ABC):
    component_name = "GoogleScholar"

    def _run(self, history, **kwargs):
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return GoogleScholar.be_output("")

        scholar_client = scholarly.search_pubs(ans, patents=self._param.patents, year_low=self._param.year_low,
                                               year_high=self._param.year_high, sort_by=self._param.sort_by)
        scholar_res = []
        for i in range(self._param.top_n):
            try:
                pub = next(scholar_client)
                scholar_res.append({"content": 'Title: ' + pub['bib']['title'] + '\n_Url: <a href="' + pub[
                    'pub_url'] + '"></a> ' + "\n author: " + ",".join(pub['bib']['author']) + '\n Abstract: ' + pub[
                    'bib'].get('abstract', 'no abstract')})
            except Exception as e:
                # Exception also covers StopIteration, raised when the result iterator is exhausted.
                print("**ERROR** " + str(e))
                break

        if not scholar_res:
            return GoogleScholar.be_output("")

        df = pd.DataFrame(scholar_res)
        if DEBUG: print(df, ":::::::::::::::::::::::::::::::::")
        return df
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from abc import ABC | |||
| import pandas as pd | |||
| from agent.settings import DEBUG | |||
| from agent.component.base import ComponentBase, ComponentParamBase | |||
| from scholarly import scholarly | |||
| class GoogleScholarParam(ComponentParamBase): | |||
| """ | |||
| Define the GoogleScholar component parameters. | |||
| """ | |||
| def __init__(self): | |||
| super().__init__() | |||
| self.top_n = 6 | |||
| self.sort_by = 'relevance' | |||
| self.year_low = None | |||
| self.year_high = None | |||
| self.patents = True | |||
| def check(self): | |||
| self.check_positive_integer(self.top_n, "Top N") | |||
| self.check_valid_value(self.sort_by, "GoogleScholar Sort_by", ['date', 'relevance']) | |||
| self.check_boolean(self.patents, "Whether or not to include patents, defaults to True") | |||
| class GoogleScholar(ComponentBase, ABC): | |||
| component_name = "GoogleScholar" | |||
| def _run(self, history, **kwargs): | |||
| ans = self.get_input() | |||
| ans = " - ".join(ans["content"]) if "content" in ans else "" | |||
| if not ans: | |||
| return GoogleScholar.be_output("") | |||
| scholar_client = scholarly.search_pubs(ans, patents=self._param.patents, year_low=self._param.year_low, | |||
| year_high=self._param.year_high, sort_by=self._param.sort_by) | |||
| scholar_res = [] | |||
| for i in range(self._param.top_n): | |||
| try: | |||
| pub = next(scholar_client) | |||
| scholar_res.append({"content": 'Title: ' + pub['bib']['title'] + '\n_Url: <a href="' + pub[ | |||
| 'pub_url'] + '"></a> ' + "\n author: " + ",".join(pub['bib']['author']) + '\n Abstract: ' + pub[ | |||
| 'bib'].get('abstract', 'no abstract')}) | |||
| except StopIteration: | |||
| break | |||
| except Exception as e: | |||
| print("**ERROR** " + str(e)) | |||
| break | |||
| if not scholar_res: | |||
| return GoogleScholar.be_output("") | |||
| df = pd.DataFrame(scholar_res) | |||
| if DEBUG: print(df, ":::::::::::::::::::::::::::::::::") | |||
| return df | |||
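For readers skimming the diff: the GoogleScholar component above is essentially a thin wrapper around `scholarly.search_pubs`. Below is a minimal, standalone sketch of that pattern (not part of this PR); `fetch_scholar` and its defaults are illustrative only, assuming the `scholarly` package is installed.

```python
from scholarly import scholarly  # assumes the scholarly package is available

def fetch_scholar(query: str, top_n: int = 6, patents: bool = True) -> list:
    """Collect up to top_n publications for `query` from Google Scholar."""
    results = []
    pubs = scholarly.search_pubs(query, patents=patents, sort_by="relevance")
    for _ in range(top_n):
        try:
            pub = next(pubs)
        except StopIteration:
            break  # fewer than top_n results were returned
        except Exception as exc:
            print("**ERROR** " + str(exc))
            break
        bib = pub.get("bib", {})
        results.append({
            "title": bib.get("title", ""),
            "url": pub.get("pub_url", ""),
            "authors": ", ".join(bib.get("author", [])),
            "abstract": bib.get("abstract", "no abstract"),
        })
    return results
```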
| @@ -1,111 +1,111 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from abc import ABC | |||
| import pandas as pd | |||
| import requests | |||
| from agent.component.base import ComponentBase, ComponentParamBase | |||
| class QWeatherParam(ComponentParamBase): | |||
| """ | |||
| Define the QWeather component parameters. | |||
| """ | |||
| def __init__(self): | |||
| super().__init__() | |||
| self.web_apikey = "xxx" | |||
| self.lang = "zh" | |||
| self.type = "weather" | |||
| self.user_type = 'free' | |||
| self.error_code = { | |||
| "204": "The request was successful, but the region you are querying does not have the data you need at this time.", | |||
| "400": "Request error, may contain incorrect request parameters or missing mandatory request parameters.", | |||
| "401": "Authentication fails, possibly using the wrong KEY, wrong digital signature, wrong type of KEY (e.g. using the SDK's KEY to access the Web API).", | |||
| "402": "Exceeded the number of accesses or the balance is not enough to support continued access to the service, you can recharge, upgrade the accesses or wait for the accesses to be reset.", | |||
| "403": "No access, may be the binding PackageName, BundleID, domain IP address is inconsistent, or the data that requires additional payment.", | |||
| "404": "The queried data or region does not exist.", | |||
| "429": "Exceeded the limited QPM (number of accesses per minute), please refer to the QPM description", | |||
| "500": "No response or timeout, interface service abnormality please contact us" | |||
| } | |||
| # Weather | |||
| self.time_period = 'now' | |||
| def check(self): | |||
| self.check_empty(self.web_apikey, "QWeather API KEY") | |||
| self.check_valid_value(self.type, "Type", ["weather", "indices", "airquality"]) | |||
| self.check_valid_value(self.user_type, "Free subscription or paid subscription", ["free", "paid"]) | |||
| self.check_valid_value(self.lang, "Use language", | |||
| ['zh', 'zh-hant', 'en', 'de', 'es', 'fr', 'it', 'ja', 'ko', 'ru', 'hi', 'th', 'ar', 'pt', | |||
| 'bn', 'ms', 'nl', 'el', 'la', 'sv', 'id', 'pl', 'tr', 'cs', 'et', 'vi', 'fil', 'fi', | |||
| 'he', 'is', 'nb']) | |||
| self.check_valid_value(self.time_period, "Time period", ['now', '3d', '7d', '10d', '15d', '30d']) | |||
| class QWeather(ComponentBase, ABC): | |||
| component_name = "QWeather" | |||
| def _run(self, history, **kwargs): | |||
| ans = self.get_input() | |||
| ans = "".join(ans["content"]) if "content" in ans else "" | |||
| if not ans: | |||
| return QWeather.be_output("") | |||
| try: | |||
| response = requests.get( | |||
| url="https://geoapi.qweather.com/v2/city/lookup?location=" + ans + "&key=" + self._param.web_apikey).json() | |||
| if response["code"] == "200": | |||
| location_id = response["location"][0]["id"] | |||
| else: | |||
| return QWeather.be_output("**Error**" + self._param.error_code[response["code"]]) | |||
| base_url = "https://api.qweather.com/v7/" if self._param.user_type == 'paid' else "https://devapi.qweather.com/v7/" | |||
| if self._param.type == "weather": | |||
| url = base_url + "weather/" + self._param.time_period + "?location=" + location_id + "&key=" + self._param.web_apikey + "&lang=" + self._param.lang | |||
| response = requests.get(url=url).json() | |||
| if response["code"] == "200": | |||
| if self._param.time_period == "now": | |||
| return QWeather.be_output(str(response["now"])) | |||
| else: | |||
| qweather_res = [{"content": str(i) + "\n"} for i in response["daily"]] | |||
| if not qweather_res: | |||
| return QWeather.be_output("") | |||
| df = pd.DataFrame(qweather_res) | |||
| return df | |||
| else: | |||
| return QWeather.be_output("**Error**" + self._param.error_code[response["code"]]) | |||
| elif self._param.type == "indices": | |||
| url = base_url + "indices/1d?type=0&location=" + location_id + "&key=" + self._param.web_apikey + "&lang=" + self._param.lang | |||
| response = requests.get(url=url).json() | |||
| if response["code"] == "200": | |||
| indices_res = response["daily"][0]["date"] + "\n" + "\n".join( | |||
| [i["name"] + ": " + i["category"] + ", " + i["text"] for i in response["daily"]]) | |||
| return QWeather.be_output(indices_res) | |||
| else: | |||
| return QWeather.be_output("**Error**" + self._param.error_code[response["code"]]) | |||
| elif self._param.type == "airquality": | |||
| url = base_url + "air/now?location=" + location_id + "&key=" + self._param.web_apikey + "&lang=" + self._param.lang | |||
| response = requests.get(url=url).json() | |||
| if response["code"] == "200": | |||
| return QWeather.be_output(str(response["now"])) | |||
| else: | |||
| return QWeather.be_output("**Error**" + self._param.error_code[response["code"]]) | |||
| except Exception as e: | |||
| return QWeather.be_output("**Error**" + str(e)) | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from abc import ABC | |||
| import pandas as pd | |||
| import requests | |||
| from agent.component.base import ComponentBase, ComponentParamBase | |||
| class QWeatherParam(ComponentParamBase): | |||
| """ | |||
| Define the QWeather component parameters. | |||
| """ | |||
| def __init__(self): | |||
| super().__init__() | |||
| self.web_apikey = "xxx" | |||
| self.lang = "zh" | |||
| self.type = "weather" | |||
| self.user_type = 'free' | |||
| self.error_code = { | |||
| "204": "The request was successful, but the region you are querying does not have the data you need at this time.", | |||
| "400": "Request error, may contain incorrect request parameters or missing mandatory request parameters.", | |||
| "401": "Authentication fails, possibly using the wrong KEY, wrong digital signature, wrong type of KEY (e.g. using the SDK's KEY to access the Web API).", | |||
| "402": "Exceeded the number of accesses or the balance is not enough to support continued access to the service, you can recharge, upgrade the accesses or wait for the accesses to be reset.", | |||
| "403": "No access, may be the binding PackageName, BundleID, domain IP address is inconsistent, or the data that requires additional payment.", | |||
| "404": "The queried data or region does not exist.", | |||
| "429": "Exceeded the limited QPM (number of accesses per minute), please refer to the QPM description", | |||
| "500": "No response or timeout, interface service abnormality please contact us" | |||
| } | |||
| # Weather | |||
| self.time_period = 'now' | |||
| def check(self): | |||
| self.check_empty(self.web_apikey, "QWeather API KEY") | |||
| self.check_valid_value(self.type, "Type", ["weather", "indices", "airquality"]) | |||
| self.check_valid_value(self.user_type, "Free subscription or paid subscription", ["free", "paid"]) | |||
| self.check_valid_value(self.lang, "Use language", | |||
| ['zh', 'zh-hant', 'en', 'de', 'es', 'fr', 'it', 'ja', 'ko', 'ru', 'hi', 'th', 'ar', 'pt', | |||
| 'bn', 'ms', 'nl', 'el', 'la', 'sv', 'id', 'pl', 'tr', 'cs', 'et', 'vi', 'fil', 'fi', | |||
| 'he', 'is', 'nb']) | |||
| self.check_valid_value(self.time_period, "Time period", ['now', '3d', '7d', '10d', '15d', '30d']) | |||
| class QWeather(ComponentBase, ABC): | |||
| component_name = "QWeather" | |||
| def _run(self, history, **kwargs): | |||
| ans = self.get_input() | |||
| ans = "".join(ans["content"]) if "content" in ans else "" | |||
| if not ans: | |||
| return QWeather.be_output("") | |||
| try: | |||
| response = requests.get( | |||
| url="https://geoapi.qweather.com/v2/city/lookup?location=" + ans + "&key=" + self._param.web_apikey).json() | |||
| if response["code"] == "200": | |||
| location_id = response["location"][0]["id"] | |||
| else: | |||
| return QWeather.be_output("**Error**" + self._param.error_code[response["code"]]) | |||
| base_url = "https://api.qweather.com/v7/" if self._param.user_type == 'paid' else "https://devapi.qweather.com/v7/" | |||
| if self._param.type == "weather": | |||
| url = base_url + "weather/" + self._param.time_period + "?location=" + location_id + "&key=" + self._param.web_apikey + "&lang=" + self._param.lang | |||
| response = requests.get(url=url).json() | |||
| if response["code"] == "200": | |||
| if self._param.time_period == "now": | |||
| return QWeather.be_output(str(response["now"])) | |||
| else: | |||
| qweather_res = [{"content": str(i) + "\n"} for i in response["daily"]] | |||
| if not qweather_res: | |||
| return QWeather.be_output("") | |||
| df = pd.DataFrame(qweather_res) | |||
| return df | |||
| else: | |||
| return QWeather.be_output("**Error**" + self._param.error_code[response["code"]]) | |||
| elif self._param.type == "indices": | |||
| url = base_url + "indices/1d?type=0&location=" + location_id + "&key=" + self._param.web_apikey + "&lang=" + self._param.lang | |||
| response = requests.get(url=url).json() | |||
| if response["code"] == "200": | |||
| indices_res = response["daily"][0]["date"] + "\n" + "\n".join( | |||
| [i["name"] + ": " + i["category"] + ", " + i["text"] for i in response["daily"]]) | |||
| return QWeather.be_output(indices_res) | |||
| else: | |||
| return QWeather.be_output("**Error**" + self._param.error_code[response["code"]]) | |||
| elif self._param.type == "airquality": | |||
| url = base_url + "air/now?location=" + location_id + "&key=" + self._param.web_apikey + "&lang=" + self._param.lang | |||
| response = requests.get(url=url).json() | |||
| if response["code"] == "200": | |||
| return QWeather.be_output(str(response["now"])) | |||
| else: | |||
| return QWeather.be_output("**Error**" + self._param.error_code[response["code"]]) | |||
| except Exception as e: | |||
| return QWeather.be_output("**Error**" + str(e)) | |||
| @@ -1,62 +1,62 @@ | |||
| { | |||
| "components": { | |||
| "begin": { | |||
| "obj":{ | |||
| "component_name": "Begin", | |||
| "params": { | |||
| "prologue": "Hi there!" | |||
| } | |||
| }, | |||
| "downstream": ["answer:0"], | |||
| "upstream": [] | |||
| }, | |||
| "answer:0": { | |||
| "obj": { | |||
| "component_name": "Answer", | |||
| "params": {} | |||
| }, | |||
| "downstream": ["keyword:0"], | |||
| "upstream": ["begin"] | |||
| }, | |||
| "keyword:0": { | |||
| "obj": { | |||
| "component_name": "KeywordExtract", | |||
| "params": { | |||
| "llm_id": "deepseek-chat", | |||
| "prompt": "- Role: You're a question analyzer.\n - Requirements:\n - Summarize user's question, and give top %s important keyword/phrase.\n - Use comma as a delimiter to separate keywords/phrases.\n - Answer format: (in language of user's question)\n - keyword: ", | |||
| "temperature": 0.2, | |||
| "top_n": 1 | |||
| } | |||
| }, | |||
| "downstream": ["wikipedia:0"], | |||
| "upstream": ["answer:0"] | |||
| }, | |||
| "wikipedia:0": { | |||
| "obj":{ | |||
| "component_name": "Wikipedia", | |||
| "params": { | |||
| "top_n": 10 | |||
| } | |||
| }, | |||
| "downstream": ["generate:0"], | |||
| "upstream": ["keyword:0"] | |||
| }, | |||
| "generate:1": { | |||
| "obj": { | |||
| "component_name": "Generate", | |||
| "params": { | |||
| "llm_id": "deepseek-chat", | |||
| "prompt": "You are an intelligent assistant. Please answer the question based on content from Wikipedia. When the answer from Wikipedia is incomplete, you need to output the URL link of the corresponding content as well. When all the content searched from Wikipedia is irrelevant to the question, your answer must include the sentence, \"The answer you are looking for is not found in the Wikipedia!\". Answers need to consider chat history.\n The content of Wikipedia is as follows:\n {input}\n The above is the content of Wikipedia.", | |||
| "temperature": 0.2 | |||
| } | |||
| }, | |||
| "downstream": ["answer:0"], | |||
| "upstream": ["wikipedia:0"] | |||
| } | |||
| }, | |||
| "history": [], | |||
| "path": [], | |||
| "messages": [], | |||
| "reference": {}, | |||
| "answer": [] | |||
| } | |||
| { | |||
| "components": { | |||
| "begin": { | |||
| "obj":{ | |||
| "component_name": "Begin", | |||
| "params": { | |||
| "prologue": "Hi there!" | |||
| } | |||
| }, | |||
| "downstream": ["answer:0"], | |||
| "upstream": [] | |||
| }, | |||
| "answer:0": { | |||
| "obj": { | |||
| "component_name": "Answer", | |||
| "params": {} | |||
| }, | |||
| "downstream": ["keyword:0"], | |||
| "upstream": ["begin"] | |||
| }, | |||
| "keyword:0": { | |||
| "obj": { | |||
| "component_name": "KeywordExtract", | |||
| "params": { | |||
| "llm_id": "deepseek-chat", | |||
| "prompt": "- Role: You're a question analyzer.\n - Requirements:\n - Summarize user's question, and give top %s important keyword/phrase.\n - Use comma as a delimiter to separate keywords/phrases.\n - Answer format: (in language of user's question)\n - keyword: ", | |||
| "temperature": 0.2, | |||
| "top_n": 1 | |||
| } | |||
| }, | |||
| "downstream": ["wikipedia:0"], | |||
| "upstream": ["answer:0"] | |||
| }, | |||
| "wikipedia:0": { | |||
| "obj":{ | |||
| "component_name": "Wikipedia", | |||
| "params": { | |||
| "top_n": 10 | |||
| } | |||
| }, | |||
| "downstream": ["generate:0"], | |||
| "upstream": ["keyword:0"] | |||
| }, | |||
| "generate:1": { | |||
| "obj": { | |||
| "component_name": "Generate", | |||
| "params": { | |||
| "llm_id": "deepseek-chat", | |||
| "prompt": "You are an intelligent assistant. Please answer the question based on content from Wikipedia. When the answer from Wikipedia is incomplete, you need to output the URL link of the corresponding content as well. When all the content searched from Wikipedia is irrelevant to the question, your answer must include the sentence, \"The answer you are looking for is not found in the Wikipedia!\". Answers need to consider chat history.\n The content of Wikipedia is as follows:\n {input}\n The above is the content of Wikipedia.", | |||
| "temperature": 0.2 | |||
| } | |||
| }, | |||
| "downstream": ["answer:0"], | |||
| "upstream": ["wikipedia:0"] | |||
| } | |||
| }, | |||
| "history": [], | |||
| "path": [], | |||
| "messages": [], | |||
| "reference": {}, | |||
| "answer": [] | |||
| } | |||
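The JSON above is a canvas template: each component lists its `downstream` and `upstream` neighbours, and the engine walks those edges. As a reading aid only (this is not how the real executor works), a minimal sketch that follows the first `downstream` edge from `begin`:

```python
import json

def linear_path(dsl: dict, start: str = "begin") -> list:
    """Follow the first downstream edge from `start` until a component repeats."""
    order, seen, current = [], set(), start
    while current and current not in seen:
        seen.add(current)
        order.append(current)
        downstream = dsl["components"][current]["downstream"]
        current = downstream[0] if downstream else None
    return order

# Hypothetical usage, with the template above saved as wikipedia_dsl.json:
# with open("wikipedia_dsl.json") as f:
#     print(linear_path(json.load(f)))
# -> ['begin', 'answer:0', 'keyword:0', 'wikipedia:0', ...]
```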
| @@ -1,125 +1,125 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import logging | |||
| import os | |||
| import sys | |||
| from importlib.util import module_from_spec, spec_from_file_location | |||
| from pathlib import Path | |||
| from flask import Blueprint, Flask | |||
| from werkzeug.wrappers.request import Request | |||
| from flask_cors import CORS | |||
| from api.db import StatusEnum | |||
| from api.db.db_models import close_connection | |||
| from api.db.services import UserService | |||
| from api.utils import CustomJSONEncoder, commands | |||
| from flask_session import Session | |||
| from flask_login import LoginManager | |||
| from api.settings import SECRET_KEY, stat_logger | |||
| from api.settings import API_VERSION, access_logger | |||
| from api.utils.api_utils import server_error_response | |||
| from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer | |||
| __all__ = ['app'] | |||
| logger = logging.getLogger('flask.app') | |||
| for h in access_logger.handlers: | |||
| logger.addHandler(h) | |||
| Request.json = property(lambda self: self.get_json(force=True, silent=True)) | |||
| app = Flask(__name__) | |||
| CORS(app, supports_credentials=True, max_age=2592000) | |||
| app.url_map.strict_slashes = False | |||
| app.json_encoder = CustomJSONEncoder | |||
| app.errorhandler(Exception)(server_error_response) | |||
| ## convenient for dev and debug | |||
| #app.config["LOGIN_DISABLED"] = True | |||
| app.config["SESSION_PERMANENT"] = False | |||
| app.config["SESSION_TYPE"] = "filesystem" | |||
| app.config['MAX_CONTENT_LENGTH'] = int(os.environ.get("MAX_CONTENT_LENGTH", 128 * 1024 * 1024)) | |||
| Session(app) | |||
| login_manager = LoginManager() | |||
| login_manager.init_app(app) | |||
| commands.register_commands(app) | |||
| def search_pages_path(pages_dir): | |||
| app_path_list = [path for path in pages_dir.glob('*_app.py') if not path.name.startswith('.')] | |||
| api_path_list = [path for path in pages_dir.glob('*_api.py') if not path.name.startswith('.')] | |||
| app_path_list.extend(api_path_list) | |||
| return app_path_list | |||
| def register_page(page_path): | |||
| path = f'{page_path}' | |||
| page_name = page_path.stem.removesuffix('_api') if "_api" in path else page_path.stem.removesuffix('_app') | |||
| module_name = '.'.join(page_path.parts[page_path.parts.index('api'):-1] + (page_name,)) | |||
| spec = spec_from_file_location(module_name, page_path) | |||
| page = module_from_spec(spec) | |||
| page.app = app | |||
| page.manager = Blueprint(page_name, module_name) | |||
| sys.modules[module_name] = page | |||
| spec.loader.exec_module(page) | |||
| page_name = getattr(page, 'page_name', page_name) | |||
| url_prefix = f'/api/{API_VERSION}/{page_name}' if "_api" in path else f'/{API_VERSION}/{page_name}' | |||
| app.register_blueprint(page.manager, url_prefix=url_prefix) | |||
| return url_prefix | |||
| pages_dir = [ | |||
| Path(__file__).parent, | |||
| Path(__file__).parent.parent / 'api' / 'apps', # FIXME: ragflow/api/api/apps, can be removed? | |||
| ] | |||
| client_urls_prefix = [ | |||
| register_page(path) | |||
| for dir in pages_dir | |||
| for path in search_pages_path(dir) | |||
| ] | |||
| @login_manager.request_loader | |||
| def load_user(web_request): | |||
| jwt = Serializer(secret_key=SECRET_KEY) | |||
| authorization = web_request.headers.get("Authorization") | |||
| if authorization: | |||
| try: | |||
| access_token = str(jwt.loads(authorization)) | |||
| user = UserService.query(access_token=access_token, status=StatusEnum.VALID.value) | |||
| if user: | |||
| return user[0] | |||
| else: | |||
| return None | |||
| except Exception as e: | |||
| stat_logger.exception(e) | |||
| return None | |||
| else: | |||
| return None | |||
| @app.teardown_request | |||
| def _db_close(exc): | |||
| close_connection() | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import logging | |||
| import os | |||
| import sys | |||
| from importlib.util import module_from_spec, spec_from_file_location | |||
| from pathlib import Path | |||
| from flask import Blueprint, Flask | |||
| from werkzeug.wrappers.request import Request | |||
| from flask_cors import CORS | |||
| from api.db import StatusEnum | |||
| from api.db.db_models import close_connection | |||
| from api.db.services import UserService | |||
| from api.utils import CustomJSONEncoder, commands | |||
| from flask_session import Session | |||
| from flask_login import LoginManager | |||
| from api.settings import SECRET_KEY, stat_logger | |||
| from api.settings import API_VERSION, access_logger | |||
| from api.utils.api_utils import server_error_response | |||
| from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer | |||
| __all__ = ['app'] | |||
| logger = logging.getLogger('flask.app') | |||
| for h in access_logger.handlers: | |||
| logger.addHandler(h) | |||
| Request.json = property(lambda self: self.get_json(force=True, silent=True)) | |||
| app = Flask(__name__) | |||
| CORS(app, supports_credentials=True, max_age=2592000) | |||
| app.url_map.strict_slashes = False | |||
| app.json_encoder = CustomJSONEncoder | |||
| app.errorhandler(Exception)(server_error_response) | |||
| ## convenient for dev and debug | |||
| #app.config["LOGIN_DISABLED"] = True | |||
| app.config["SESSION_PERMANENT"] = False | |||
| app.config["SESSION_TYPE"] = "filesystem" | |||
| app.config['MAX_CONTENT_LENGTH'] = int(os.environ.get("MAX_CONTENT_LENGTH", 128 * 1024 * 1024)) | |||
| Session(app) | |||
| login_manager = LoginManager() | |||
| login_manager.init_app(app) | |||
| commands.register_commands(app) | |||
| def search_pages_path(pages_dir): | |||
| app_path_list = [path for path in pages_dir.glob('*_app.py') if not path.name.startswith('.')] | |||
| api_path_list = [path for path in pages_dir.glob('*_api.py') if not path.name.startswith('.')] | |||
| app_path_list.extend(api_path_list) | |||
| return app_path_list | |||
| def register_page(page_path): | |||
| path = f'{page_path}' | |||
| page_name = page_path.stem.removesuffix('_api') if "_api" in path else page_path.stem.removesuffix('_app') | |||
| module_name = '.'.join(page_path.parts[page_path.parts.index('api'):-1] + (page_name,)) | |||
| spec = spec_from_file_location(module_name, page_path) | |||
| page = module_from_spec(spec) | |||
| page.app = app | |||
| page.manager = Blueprint(page_name, module_name) | |||
| sys.modules[module_name] = page | |||
| spec.loader.exec_module(page) | |||
| page_name = getattr(page, 'page_name', page_name) | |||
| url_prefix = f'/api/{API_VERSION}/{page_name}' if "_api" in path else f'/{API_VERSION}/{page_name}' | |||
| app.register_blueprint(page.manager, url_prefix=url_prefix) | |||
| return url_prefix | |||
| pages_dir = [ | |||
| Path(__file__).parent, | |||
| Path(__file__).parent.parent / 'api' / 'apps', # FIXME: ragflow/api/api/apps, can be removed? | |||
| ] | |||
| client_urls_prefix = [ | |||
| register_page(path) | |||
| for dir in pages_dir | |||
| for path in search_pages_path(dir) | |||
| ] | |||
| @login_manager.request_loader | |||
| def load_user(web_request): | |||
| jwt = Serializer(secret_key=SECRET_KEY) | |||
| authorization = web_request.headers.get("Authorization") | |||
| if authorization: | |||
| try: | |||
| access_token = str(jwt.loads(authorization)) | |||
| user = UserService.query(access_token=access_token, status=StatusEnum.VALID.value) | |||
| if user: | |||
| return user[0] | |||
| else: | |||
| return None | |||
| except Exception as e: | |||
| stat_logger.exception(e) | |||
| return None | |||
| else: | |||
| return None | |||
| @app.teardown_request | |||
| def _db_close(exc): | |||
| close_connection() | |||
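The `register_page` helper above loads every `*_app.py` / `*_api.py` file as a module at runtime and registers its Blueprint under a versioned URL prefix. A minimal sketch of just the importlib part, shown in isolation (not part of this PR; `load_module` and the example path are illustrative):

```python
import sys
from importlib.util import module_from_spec, spec_from_file_location
from pathlib import Path

def load_module(py_file: Path, module_name: str):
    """Import `py_file` under `module_name` and register it in sys.modules."""
    spec = spec_from_file_location(module_name, py_file)
    module = module_from_spec(spec)
    sys.modules[module_name] = module   # register before exec so re-imports resolve
    spec.loader.exec_module(module)     # runs the file's top-level code
    return module

# Hypothetical usage:
# page = load_module(Path("api/apps/chunk_app.py"), "api.apps.chunk_app")
```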
| @@ -1,318 +1,318 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import datetime | |||
| import json | |||
| import traceback | |||
| from flask import request | |||
| from flask_login import login_required, current_user | |||
| from elasticsearch_dsl import Q | |||
| from rag.app.qa import rmPrefix, beAdoc | |||
| from rag.nlp import search, rag_tokenizer, keyword_extraction | |||
| from rag.utils.es_conn import ELASTICSEARCH | |||
| from rag.utils import rmSpace | |||
| from api.db import LLMType, ParserType | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from api.db.services.llm_service import TenantLLMService | |||
| from api.db.services.user_service import UserTenantService | |||
| from api.utils.api_utils import server_error_response, get_data_error_result, validate_request | |||
| from api.db.services.document_service import DocumentService | |||
| from api.settings import RetCode, retrievaler, kg_retrievaler | |||
| from api.utils.api_utils import get_json_result | |||
| import hashlib | |||
| import re | |||
| @manager.route('/list', methods=['POST']) | |||
| @login_required | |||
| @validate_request("doc_id") | |||
| def list_chunk(): | |||
| req = request.json | |||
| doc_id = req["doc_id"] | |||
| page = int(req.get("page", 1)) | |||
| size = int(req.get("size", 30)) | |||
| question = req.get("keywords", "") | |||
| try: | |||
| tenant_id = DocumentService.get_tenant_id(req["doc_id"]) | |||
| if not tenant_id: | |||
| return get_data_error_result(retmsg="Tenant not found!") | |||
| e, doc = DocumentService.get_by_id(doc_id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Document not found!") | |||
| query = { | |||
| "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True | |||
| } | |||
| if "available_int" in req: | |||
| query["available_int"] = int(req["available_int"]) | |||
| sres = retrievaler.search(query, search.index_name(tenant_id)) | |||
| res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()} | |||
| for id in sres.ids: | |||
| d = { | |||
| "chunk_id": id, | |||
| "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[ | |||
| id].get( | |||
| "content_with_weight", ""), | |||
| "doc_id": sres.field[id]["doc_id"], | |||
| "docnm_kwd": sres.field[id]["docnm_kwd"], | |||
| "important_kwd": sres.field[id].get("important_kwd", []), | |||
| "img_id": sres.field[id].get("img_id", ""), | |||
| "available_int": sres.field[id].get("available_int", 1), | |||
| "positions": sres.field[id].get("position_int", "").split("\t") | |||
| } | |||
| if len(d["positions"]) % 5 == 0: | |||
| poss = [] | |||
| for i in range(0, len(d["positions"]), 5): | |||
| poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]), | |||
| float(d["positions"][i + 3]), float(d["positions"][i + 4])]) | |||
| d["positions"] = poss | |||
| res["chunks"].append(d) | |||
| return get_json_result(data=res) | |||
| except Exception as e: | |||
| if str(e).find("not_found") > 0: | |||
| return get_json_result(data=False, retmsg=f'No chunk found!', | |||
| retcode=RetCode.DATA_ERROR) | |||
| return server_error_response(e) | |||
| @manager.route('/get', methods=['GET']) | |||
| @login_required | |||
| def get(): | |||
| chunk_id = request.args["chunk_id"] | |||
| try: | |||
| tenants = UserTenantService.query(user_id=current_user.id) | |||
| if not tenants: | |||
| return get_data_error_result(retmsg="Tenant not found!") | |||
| res = ELASTICSEARCH.get( | |||
| chunk_id, search.index_name( | |||
| tenants[0].tenant_id)) | |||
| if not res.get("found"): | |||
| return server_error_response("Chunk not found") | |||
| id = res["_id"] | |||
| res = res["_source"] | |||
| res["chunk_id"] = id | |||
| k = [] | |||
| for n in res.keys(): | |||
| if re.search(r"(_vec$|_sm_|_tks|_ltks)", n): | |||
| k.append(n) | |||
| for n in k: | |||
| del res[n] | |||
| return get_json_result(data=res) | |||
| except Exception as e: | |||
| if str(e).find("NotFoundError") >= 0: | |||
| return get_json_result(data=False, retmsg=f'Chunk not found!', | |||
| retcode=RetCode.DATA_ERROR) | |||
| return server_error_response(e) | |||
| @manager.route('/set', methods=['POST']) | |||
| @login_required | |||
| @validate_request("doc_id", "chunk_id", "content_with_weight", | |||
| "important_kwd") | |||
| def set(): | |||
| req = request.json | |||
| d = { | |||
| "id": req["chunk_id"], | |||
| "content_with_weight": req["content_with_weight"]} | |||
| d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"]) | |||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||
| d["important_kwd"] = req["important_kwd"] | |||
| d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"])) | |||
| if "available_int" in req: | |||
| d["available_int"] = req["available_int"] | |||
| try: | |||
| tenant_id = DocumentService.get_tenant_id(req["doc_id"]) | |||
| if not tenant_id: | |||
| return get_data_error_result(retmsg="Tenant not found!") | |||
| embd_id = DocumentService.get_embd_id(req["doc_id"]) | |||
| embd_mdl = TenantLLMService.model_instance( | |||
| tenant_id, LLMType.EMBEDDING.value, embd_id) | |||
| e, doc = DocumentService.get_by_id(req["doc_id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Document not found!") | |||
| if doc.parser_id == ParserType.QA: | |||
| arr = [ | |||
| t for t in re.split( | |||
| r"[\n\t]", | |||
| req["content_with_weight"]) if len(t) > 1] | |||
| if len(arr) != 2: | |||
| return get_data_error_result( | |||
| retmsg="Q&A must be separated by TAB/ENTER key.") | |||
| q, a = rmPrefix(arr[0]), rmPrefix(arr[1]) | |||
| d = beAdoc(d, arr[0], arr[1], not any( | |||
| [rag_tokenizer.is_chinese(t) for t in q + a])) | |||
| v, c = embd_mdl.encode([doc.name, req["content_with_weight"]]) | |||
| v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] | |||
| d["q_%d_vec" % len(v)] = v.tolist() | |||
| ELASTICSEARCH.upsert([d], search.index_name(tenant_id)) | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/switch', methods=['POST']) | |||
| @login_required | |||
| @validate_request("chunk_ids", "available_int", "doc_id") | |||
| def switch(): | |||
| req = request.json | |||
| try: | |||
| tenant_id = DocumentService.get_tenant_id(req["doc_id"]) | |||
| if not tenant_id: | |||
| return get_data_error_result(retmsg="Tenant not found!") | |||
| if not ELASTICSEARCH.upsert([{"id": i, "available_int": int(req["available_int"])} for i in req["chunk_ids"]], | |||
| search.index_name(tenant_id)): | |||
| return get_data_error_result(retmsg="Index updating failure") | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/rm', methods=['POST']) | |||
| @login_required | |||
| @validate_request("chunk_ids", "doc_id") | |||
| def rm(): | |||
| req = request.json | |||
| try: | |||
| if not ELASTICSEARCH.deleteByQuery( | |||
| Q("ids", values=req["chunk_ids"]), search.index_name(current_user.id)): | |||
| return get_data_error_result(retmsg="Index updating failure") | |||
| e, doc = DocumentService.get_by_id(req["doc_id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Document not found!") | |||
| deleted_chunk_ids = req["chunk_ids"] | |||
| chunk_number = len(deleted_chunk_ids) | |||
| DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0) | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/create', methods=['POST']) | |||
| @login_required | |||
| @validate_request("doc_id", "content_with_weight") | |||
| def create(): | |||
| req = request.json | |||
| md5 = hashlib.md5() | |||
| md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8")) | |||
| chunck_id = md5.hexdigest() | |||
| d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]), | |||
| "content_with_weight": req["content_with_weight"]} | |||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||
| d["important_kwd"] = req.get("important_kwd", []) | |||
| d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", []))) | |||
| d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] | |||
| d["create_timestamp_flt"] = datetime.datetime.now().timestamp() | |||
| try: | |||
| e, doc = DocumentService.get_by_id(req["doc_id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Document not found!") | |||
| d["kb_id"] = [doc.kb_id] | |||
| d["docnm_kwd"] = doc.name | |||
| d["doc_id"] = doc.id | |||
| tenant_id = DocumentService.get_tenant_id(req["doc_id"]) | |||
| if not tenant_id: | |||
| return get_data_error_result(retmsg="Tenant not found!") | |||
| embd_id = DocumentService.get_embd_id(req["doc_id"]) | |||
| embd_mdl = TenantLLMService.model_instance( | |||
| tenant_id, LLMType.EMBEDDING.value, embd_id) | |||
| v, c = embd_mdl.encode([doc.name, req["content_with_weight"]]) | |||
| v = 0.1 * v[0] + 0.9 * v[1] | |||
| d["q_%d_vec" % len(v)] = v.tolist() | |||
| ELASTICSEARCH.upsert([d], search.index_name(tenant_id)) | |||
| DocumentService.increment_chunk_num( | |||
| doc.id, doc.kb_id, c, 1, 0) | |||
| return get_json_result(data={"chunk_id": chunck_id}) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/retrieval_test', methods=['POST']) | |||
| @login_required | |||
| @validate_request("kb_id", "question") | |||
| def retrieval_test(): | |||
| req = request.json | |||
| page = int(req.get("page", 1)) | |||
| size = int(req.get("size", 30)) | |||
| question = req["question"] | |||
| kb_id = req["kb_id"] | |||
| doc_ids = req.get("doc_ids", []) | |||
| similarity_threshold = float(req.get("similarity_threshold", 0.2)) | |||
| vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3)) | |||
| top = int(req.get("top_k", 1024)) | |||
| try: | |||
| e, kb = KnowledgebaseService.get_by_id(kb_id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Knowledgebase not found!") | |||
| embd_mdl = TenantLLMService.model_instance( | |||
| kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id) | |||
| rerank_mdl = None | |||
| if req.get("rerank_id"): | |||
| rerank_mdl = TenantLLMService.model_instance( | |||
| kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"]) | |||
| if req.get("keyword", False): | |||
| chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT) | |||
| question += keyword_extraction(chat_mdl, question) | |||
| retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler | |||
| ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, [kb_id], page, size, | |||
| similarity_threshold, vector_similarity_weight, top, | |||
| doc_ids, rerank_mdl=rerank_mdl) | |||
| for c in ranks["chunks"]: | |||
| if "vector" in c: | |||
| del c["vector"] | |||
| return get_json_result(data=ranks) | |||
| except Exception as e: | |||
| if str(e).find("not_found") > 0: | |||
| return get_json_result(data=False, retmsg=f'No chunk found! Please check the chunk status!', | |||
| retcode=RetCode.DATA_ERROR) | |||
| return server_error_response(e) | |||
| @manager.route('/knowledge_graph', methods=['GET']) | |||
| @login_required | |||
| def knowledge_graph(): | |||
| doc_id = request.args["doc_id"] | |||
| req = { | |||
| "doc_ids":[doc_id], | |||
| "knowledge_graph_kwd": ["graph", "mind_map"] | |||
| } | |||
| tenant_id = DocumentService.get_tenant_id(doc_id) | |||
| sres = retrievaler.search(req, search.index_name(tenant_id)) | |||
| obj = {"graph": {}, "mind_map": {}} | |||
| for id in sres.ids[:2]: | |||
| ty = sres.field[id]["knowledge_graph_kwd"] | |||
| try: | |||
| obj[ty] = json.loads(sres.field[id]["content_with_weight"]) | |||
| except Exception as e: | |||
| print(traceback.format_exc(), flush=True) | |||
| return get_json_result(data=obj) | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import datetime | |||
| import json | |||
| import traceback | |||
| from flask import request | |||
| from flask_login import login_required, current_user | |||
| from elasticsearch_dsl import Q | |||
| from rag.app.qa import rmPrefix, beAdoc | |||
| from rag.nlp import search, rag_tokenizer, keyword_extraction | |||
| from rag.utils.es_conn import ELASTICSEARCH | |||
| from rag.utils import rmSpace | |||
| from api.db import LLMType, ParserType | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from api.db.services.llm_service import TenantLLMService | |||
| from api.db.services.user_service import UserTenantService | |||
| from api.utils.api_utils import server_error_response, get_data_error_result, validate_request | |||
| from api.db.services.document_service import DocumentService | |||
| from api.settings import RetCode, retrievaler, kg_retrievaler | |||
| from api.utils.api_utils import get_json_result | |||
| import hashlib | |||
| import re | |||
| @manager.route('/list', methods=['POST']) | |||
| @login_required | |||
| @validate_request("doc_id") | |||
| def list_chunk(): | |||
| req = request.json | |||
| doc_id = req["doc_id"] | |||
| page = int(req.get("page", 1)) | |||
| size = int(req.get("size", 30)) | |||
| question = req.get("keywords", "") | |||
| try: | |||
| tenant_id = DocumentService.get_tenant_id(req["doc_id"]) | |||
| if not tenant_id: | |||
| return get_data_error_result(retmsg="Tenant not found!") | |||
| e, doc = DocumentService.get_by_id(doc_id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Document not found!") | |||
| query = { | |||
| "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True | |||
| } | |||
| if "available_int" in req: | |||
| query["available_int"] = int(req["available_int"]) | |||
| sres = retrievaler.search(query, search.index_name(tenant_id)) | |||
| res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()} | |||
| for id in sres.ids: | |||
| d = { | |||
| "chunk_id": id, | |||
| "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[ | |||
| id].get( | |||
| "content_with_weight", ""), | |||
| "doc_id": sres.field[id]["doc_id"], | |||
| "docnm_kwd": sres.field[id]["docnm_kwd"], | |||
| "important_kwd": sres.field[id].get("important_kwd", []), | |||
| "img_id": sres.field[id].get("img_id", ""), | |||
| "available_int": sres.field[id].get("available_int", 1), | |||
| "positions": sres.field[id].get("position_int", "").split("\t") | |||
| } | |||
| if len(d["positions"]) % 5 == 0: | |||
| poss = [] | |||
| for i in range(0, len(d["positions"]), 5): | |||
| poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]), | |||
| float(d["positions"][i + 3]), float(d["positions"][i + 4])]) | |||
| d["positions"] = poss | |||
| res["chunks"].append(d) | |||
| return get_json_result(data=res) | |||
| except Exception as e: | |||
| if str(e).find("not_found") > 0: | |||
| return get_json_result(data=False, retmsg=f'No chunk found!', | |||
| retcode=RetCode.DATA_ERROR) | |||
| return server_error_response(e) | |||
| @manager.route('/get', methods=['GET']) | |||
| @login_required | |||
| def get(): | |||
| chunk_id = request.args["chunk_id"] | |||
| try: | |||
| tenants = UserTenantService.query(user_id=current_user.id) | |||
| if not tenants: | |||
| return get_data_error_result(retmsg="Tenant not found!") | |||
| res = ELASTICSEARCH.get( | |||
| chunk_id, search.index_name( | |||
| tenants[0].tenant_id)) | |||
| if not res.get("found"): | |||
| return server_error_response("Chunk not found") | |||
| id = res["_id"] | |||
| res = res["_source"] | |||
| res["chunk_id"] = id | |||
| k = [] | |||
| for n in res.keys(): | |||
| if re.search(r"(_vec$|_sm_|_tks|_ltks)", n): | |||
| k.append(n) | |||
| for n in k: | |||
| del res[n] | |||
| return get_json_result(data=res) | |||
| except Exception as e: | |||
| if str(e).find("NotFoundError") >= 0: | |||
| return get_json_result(data=False, retmsg=f'Chunk not found!', | |||
| retcode=RetCode.DATA_ERROR) | |||
| return server_error_response(e) | |||
| @manager.route('/set', methods=['POST']) | |||
| @login_required | |||
| @validate_request("doc_id", "chunk_id", "content_with_weight", | |||
| "important_kwd") | |||
| def set(): | |||
| req = request.json | |||
| d = { | |||
| "id": req["chunk_id"], | |||
| "content_with_weight": req["content_with_weight"]} | |||
| d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"]) | |||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||
| d["important_kwd"] = req["important_kwd"] | |||
| d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"])) | |||
| if "available_int" in req: | |||
| d["available_int"] = req["available_int"] | |||
| try: | |||
| tenant_id = DocumentService.get_tenant_id(req["doc_id"]) | |||
| if not tenant_id: | |||
| return get_data_error_result(retmsg="Tenant not found!") | |||
| embd_id = DocumentService.get_embd_id(req["doc_id"]) | |||
| embd_mdl = TenantLLMService.model_instance( | |||
| tenant_id, LLMType.EMBEDDING.value, embd_id) | |||
| e, doc = DocumentService.get_by_id(req["doc_id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Document not found!") | |||
| if doc.parser_id == ParserType.QA: | |||
| arr = [ | |||
| t for t in re.split( | |||
| r"[\n\t]", | |||
| req["content_with_weight"]) if len(t) > 1] | |||
| if len(arr) != 2: | |||
| return get_data_error_result( | |||
| retmsg="Q&A must be separated by TAB/ENTER key.") | |||
| q, a = rmPrefix(arr[0]), rmPrefix(arr[1]) | |||
| d = beAdoc(d, arr[0], arr[1], not any( | |||
| [rag_tokenizer.is_chinese(t) for t in q + a])) | |||
| v, c = embd_mdl.encode([doc.name, req["content_with_weight"]]) | |||
| v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] | |||
| d["q_%d_vec" % len(v)] = v.tolist() | |||
| ELASTICSEARCH.upsert([d], search.index_name(tenant_id)) | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/switch', methods=['POST']) | |||
| @login_required | |||
| @validate_request("chunk_ids", "available_int", "doc_id") | |||
| def switch(): | |||
| req = request.json | |||
| try: | |||
| tenant_id = DocumentService.get_tenant_id(req["doc_id"]) | |||
| if not tenant_id: | |||
| return get_data_error_result(retmsg="Tenant not found!") | |||
| if not ELASTICSEARCH.upsert([{"id": i, "available_int": int(req["available_int"])} for i in req["chunk_ids"]], | |||
| search.index_name(tenant_id)): | |||
| return get_data_error_result(retmsg="Index updating failure") | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/rm', methods=['POST']) | |||
| @login_required | |||
| @validate_request("chunk_ids", "doc_id") | |||
| def rm(): | |||
| req = request.json | |||
| try: | |||
| if not ELASTICSEARCH.deleteByQuery( | |||
| Q("ids", values=req["chunk_ids"]), search.index_name(current_user.id)): | |||
| return get_data_error_result(retmsg="Index updating failure") | |||
| e, doc = DocumentService.get_by_id(req["doc_id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Document not found!") | |||
| deleted_chunk_ids = req["chunk_ids"] | |||
| chunk_number = len(deleted_chunk_ids) | |||
| DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0) | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/create', methods=['POST']) | |||
| @login_required | |||
| @validate_request("doc_id", "content_with_weight") | |||
| def create(): | |||
| req = request.json | |||
| md5 = hashlib.md5() | |||
| md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8")) | |||
| chunck_id = md5.hexdigest() | |||
| d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]), | |||
| "content_with_weight": req["content_with_weight"]} | |||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||
| d["important_kwd"] = req.get("important_kwd", []) | |||
| d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", []))) | |||
| d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] | |||
| d["create_timestamp_flt"] = datetime.datetime.now().timestamp() | |||
| try: | |||
| e, doc = DocumentService.get_by_id(req["doc_id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Document not found!") | |||
| d["kb_id"] = [doc.kb_id] | |||
| d["docnm_kwd"] = doc.name | |||
| d["doc_id"] = doc.id | |||
| tenant_id = DocumentService.get_tenant_id(req["doc_id"]) | |||
| if not tenant_id: | |||
| return get_data_error_result(retmsg="Tenant not found!") | |||
| embd_id = DocumentService.get_embd_id(req["doc_id"]) | |||
| embd_mdl = TenantLLMService.model_instance( | |||
| tenant_id, LLMType.EMBEDDING.value, embd_id) | |||
| v, c = embd_mdl.encode([doc.name, req["content_with_weight"]]) | |||
| v = 0.1 * v[0] + 0.9 * v[1] | |||
| d["q_%d_vec" % len(v)] = v.tolist() | |||
| ELASTICSEARCH.upsert([d], search.index_name(tenant_id)) | |||
| DocumentService.increment_chunk_num( | |||
| doc.id, doc.kb_id, c, 1, 0) | |||
| return get_json_result(data={"chunk_id": chunck_id}) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/retrieval_test', methods=['POST']) | |||
| @login_required | |||
| @validate_request("kb_id", "question") | |||
| def retrieval_test(): | |||
| req = request.json | |||
| page = int(req.get("page", 1)) | |||
| size = int(req.get("size", 30)) | |||
| question = req["question"] | |||
| kb_id = req["kb_id"] | |||
| doc_ids = req.get("doc_ids", []) | |||
| similarity_threshold = float(req.get("similarity_threshold", 0.2)) | |||
| vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3)) | |||
| top = int(req.get("top_k", 1024)) | |||
| try: | |||
| e, kb = KnowledgebaseService.get_by_id(kb_id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Knowledgebase not found!") | |||
| embd_mdl = TenantLLMService.model_instance( | |||
| kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id) | |||
| rerank_mdl = None | |||
| if req.get("rerank_id"): | |||
| rerank_mdl = TenantLLMService.model_instance( | |||
| kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"]) | |||
| if req.get("keyword", False): | |||
| chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT) | |||
| question += keyword_extraction(chat_mdl, question) | |||
| retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler | |||
| ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, [kb_id], page, size, | |||
| similarity_threshold, vector_similarity_weight, top, | |||
| doc_ids, rerank_mdl=rerank_mdl) | |||
| for c in ranks["chunks"]: | |||
| if "vector" in c: | |||
| del c["vector"] | |||
| return get_json_result(data=ranks) | |||
| except Exception as e: | |||
| if str(e).find("not_found") > 0: | |||
| return get_json_result(data=False, retmsg=f'No chunk found! Please check the chunk status!', | |||
| retcode=RetCode.DATA_ERROR) | |||
| return server_error_response(e) | |||
| @manager.route('/knowledge_graph', methods=['GET']) | |||
| @login_required | |||
| def knowledge_graph(): | |||
| doc_id = request.args["doc_id"] | |||
| req = { | |||
| "doc_ids":[doc_id], | |||
| "knowledge_graph_kwd": ["graph", "mind_map"] | |||
| } | |||
| tenant_id = DocumentService.get_tenant_id(doc_id) | |||
| sres = retrievaler.search(req, search.index_name(tenant_id)) | |||
| obj = {"graph": {}, "mind_map": {}} | |||
| for id in sres.ids[:2]: | |||
| ty = sres.field[id]["knowledge_graph_kwd"] | |||
| try: | |||
| obj[ty] = json.loads(sres.field[id]["content_with_weight"]) | |||
| except Exception as e: | |||
| print(traceback.format_exc(), flush=True) | |||
| return get_json_result(data=obj) | |||
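One detail worth noting in the `/create` route above: the chunk id is deterministic — an MD5 over the chunk text concatenated with the document id — so re-posting the same content for the same document produces the same id. A minimal sketch of that derivation (not part of this PR; `chunk_id` is a hypothetical helper name):

```python
import hashlib

def chunk_id(content_with_weight: str, doc_id: str) -> str:
    """Derive the deterministic chunk id used when creating a chunk."""
    md5 = hashlib.md5()
    md5.update((content_with_weight + doc_id).encode("utf-8"))
    return md5.hexdigest()

# Hypothetical usage:
# chunk_id("Some chunk text", "64f0c0de...")  -> 32-character hex digest
```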
| @@ -1,177 +1,177 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from copy import deepcopy | |||
| from flask import request, Response | |||
| from flask_login import login_required | |||
| from api.db.services.dialog_service import DialogService, ConversationService, chat | |||
| from api.utils.api_utils import server_error_response, get_data_error_result, validate_request | |||
| from api.utils import get_uuid | |||
| from api.utils.api_utils import get_json_result | |||
| import json | |||
| @manager.route('/set', methods=['POST']) | |||
| @login_required | |||
| def set_conversation(): | |||
| req = request.json | |||
| conv_id = req.get("conversation_id") | |||
| if conv_id: | |||
| del req["conversation_id"] | |||
| try: | |||
| if not ConversationService.update_by_id(conv_id, req): | |||
| return get_data_error_result(retmsg="Conversation not found!") | |||
| e, conv = ConversationService.get_by_id(conv_id) | |||
| if not e: | |||
| return get_data_error_result( | |||
| retmsg="Fail to update a conversation!") | |||
| conv = conv.to_dict() | |||
| return get_json_result(data=conv) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| try: | |||
| e, dia = DialogService.get_by_id(req["dialog_id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Dialog not found") | |||
| conv = { | |||
| "id": get_uuid(), | |||
| "dialog_id": req["dialog_id"], | |||
| "name": req.get("name", "New conversation"), | |||
| "message": [{"role": "assistant", "content": dia.prompt_config["prologue"]}] | |||
| } | |||
| ConversationService.save(**conv) | |||
| e, conv = ConversationService.get_by_id(conv["id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Fail to new a conversation!") | |||
| conv = conv.to_dict() | |||
| return get_json_result(data=conv) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/get', methods=['GET']) | |||
| @login_required | |||
| def get(): | |||
| conv_id = request.args["conversation_id"] | |||
| try: | |||
| e, conv = ConversationService.get_by_id(conv_id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Conversation not found!") | |||
| conv = conv.to_dict() | |||
| return get_json_result(data=conv) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/rm', methods=['POST']) | |||
| @login_required | |||
| def rm(): | |||
| conv_ids = request.json["conversation_ids"] | |||
| try: | |||
| for cid in conv_ids: | |||
| ConversationService.delete_by_id(cid) | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/list', methods=['GET']) | |||
| @login_required | |||
| def list_conversation(): | |||
| dialog_id = request.args["dialog_id"] | |||
| try: | |||
| convs = ConversationService.query( | |||
| dialog_id=dialog_id, | |||
| order_by=ConversationService.model.create_time, | |||
| reverse=True) | |||
| convs = [d.to_dict() for d in convs] | |||
| return get_json_result(data=convs) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/completion', methods=['POST']) | |||
| @login_required | |||
| #@validate_request("conversation_id", "messages") | |||
| def completion(): | |||
| req = request.json | |||
| #req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [ | |||
| # {"role": "user", "content": "上海有吗?"} | |||
| #]} | |||
| msg = [] | |||
| for m in req["messages"]: | |||
| if m["role"] == "system": | |||
| continue | |||
| if m["role"] == "assistant" and not msg: | |||
| continue | |||
| msg.append({"role": m["role"], "content": m["content"]}) | |||
| if "doc_ids" in m: | |||
| msg[-1]["doc_ids"] = m["doc_ids"] | |||
| try: | |||
| e, conv = ConversationService.get_by_id(req["conversation_id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Conversation not found!") | |||
| conv.message.append(deepcopy(msg[-1])) | |||
| e, dia = DialogService.get_by_id(conv.dialog_id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Dialog not found!") | |||
| del req["conversation_id"] | |||
| del req["messages"] | |||
| if not conv.reference: | |||
| conv.reference = [] | |||
| conv.message.append({"role": "assistant", "content": ""}) | |||
| conv.reference.append({"chunks": [], "doc_aggs": []}) | |||
| def fillin_conv(ans): | |||
| nonlocal conv | |||
| if not conv.reference: | |||
| conv.reference.append(ans["reference"]) | |||
| else: conv.reference[-1] = ans["reference"] | |||
| conv.message[-1] = {"role": "assistant", "content": ans["answer"]} | |||
| def stream(): | |||
| nonlocal dia, msg, req, conv | |||
| try: | |||
| for ans in chat(dia, msg, True, **req): | |||
| fillin_conv(ans) | |||
| yield "data:"+json.dumps({"retcode": 0, "retmsg": "", "data": ans}, ensure_ascii=False) + "\n\n" | |||
| ConversationService.update_by_id(conv.id, conv.to_dict()) | |||
| except Exception as e: | |||
| yield "data:" + json.dumps({"retcode": 500, "retmsg": str(e), | |||
| "data": {"answer": "**ERROR**: "+str(e), "reference": []}}, | |||
| ensure_ascii=False) + "\n\n" | |||
| yield "data:"+json.dumps({"retcode": 0, "retmsg": "", "data": True}, ensure_ascii=False) + "\n\n" | |||
| if req.get("stream", True): | |||
| resp = Response(stream(), mimetype="text/event-stream") | |||
| resp.headers.add_header("Cache-control", "no-cache") | |||
| resp.headers.add_header("Connection", "keep-alive") | |||
| resp.headers.add_header("X-Accel-Buffering", "no") | |||
| resp.headers.add_header("Content-Type", "text/event-stream; charset=utf-8") | |||
| return resp | |||
| else: | |||
| answer = None | |||
| for ans in chat(dia, msg, **req): | |||
| answer = ans | |||
| fillin_conv(ans) | |||
| ConversationService.update_by_id(conv.id, conv.to_dict()) | |||
| break | |||
| return get_json_result(data=answer) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from copy import deepcopy | |||
| from flask import request, Response | |||
| from flask_login import login_required | |||
| from api.db.services.dialog_service import DialogService, ConversationService, chat | |||
| from api.utils.api_utils import server_error_response, get_data_error_result, validate_request | |||
| from api.utils import get_uuid | |||
| from api.utils.api_utils import get_json_result | |||
| import json | |||
| @manager.route('/set', methods=['POST']) | |||
| @login_required | |||
| def set_conversation(): | |||
| req = request.json | |||
| conv_id = req.get("conversation_id") | |||
| if conv_id: | |||
| del req["conversation_id"] | |||
| try: | |||
| if not ConversationService.update_by_id(conv_id, req): | |||
| return get_data_error_result(retmsg="Conversation not found!") | |||
| e, conv = ConversationService.get_by_id(conv_id) | |||
| if not e: | |||
| return get_data_error_result( | |||
| retmsg="Fail to update a conversation!") | |||
| conv = conv.to_dict() | |||
| return get_json_result(data=conv) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| try: | |||
| e, dia = DialogService.get_by_id(req["dialog_id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Dialog not found") | |||
| conv = { | |||
| "id": get_uuid(), | |||
| "dialog_id": req["dialog_id"], | |||
| "name": req.get("name", "New conversation"), | |||
| "message": [{"role": "assistant", "content": dia.prompt_config["prologue"]}] | |||
| } | |||
| ConversationService.save(**conv) | |||
| e, conv = ConversationService.get_by_id(conv["id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Fail to new a conversation!") | |||
| conv = conv.to_dict() | |||
| return get_json_result(data=conv) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/get', methods=['GET']) | |||
| @login_required | |||
| def get(): | |||
| conv_id = request.args["conversation_id"] | |||
| try: | |||
| e, conv = ConversationService.get_by_id(conv_id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Conversation not found!") | |||
| conv = conv.to_dict() | |||
| return get_json_result(data=conv) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/rm', methods=['POST']) | |||
| @login_required | |||
| def rm(): | |||
| conv_ids = request.json["conversation_ids"] | |||
| try: | |||
| for cid in conv_ids: | |||
| ConversationService.delete_by_id(cid) | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/list', methods=['GET']) | |||
| @login_required | |||
| def list_conversation(): | |||
| dialog_id = request.args["dialog_id"] | |||
| try: | |||
| convs = ConversationService.query( | |||
| dialog_id=dialog_id, | |||
| order_by=ConversationService.model.create_time, | |||
| reverse=True) | |||
| convs = [d.to_dict() for d in convs] | |||
| return get_json_result(data=convs) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
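| # /completion builds the prompt from req["messages"] (system messages and leading | |||
| # assistant messages are dropped), appends the last user turn to the conversation and | |||
| # answers via chat(). With "stream" left at its default (True) the reply is sent as | |||
| # server-sent events; with "stream": false a single JSON result is returned. | |||
| # Hypothetical example (URL, port and ids are assumptions, not taken from this file): | |||
| #   curl -X POST http://localhost:9380/v1/conversation/completion \ | |||
| #        -H 'Content-Type: application/json' \ | |||
| #        -d '{"conversation_id": "<conv_id>", "stream": false, | |||
| #             "messages": [{"role": "user", "content": "hello"}]}' | |||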
| @manager.route('/completion', methods=['POST']) | |||
| @login_required | |||
| #@validate_request("conversation_id", "messages") | |||
| def completion(): | |||
| req = request.json | |||
| #req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [ | |||
| # {"role": "user", "content": "上海有吗?"} | |||
| #]} | |||
| msg = [] | |||
| for m in req["messages"]: | |||
| if m["role"] == "system": | |||
| continue | |||
| if m["role"] == "assistant" and not msg: | |||
| continue | |||
| msg.append({"role": m["role"], "content": m["content"]}) | |||
| if "doc_ids" in m: | |||
| msg[-1]["doc_ids"] = m["doc_ids"] | |||
| try: | |||
| e, conv = ConversationService.get_by_id(req["conversation_id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Conversation not found!") | |||
| conv.message.append(deepcopy(msg[-1])) | |||
| e, dia = DialogService.get_by_id(conv.dialog_id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Dialog not found!") | |||
| del req["conversation_id"] | |||
| del req["messages"] | |||
| if not conv.reference: | |||
| conv.reference = [] | |||
| conv.message.append({"role": "assistant", "content": ""}) | |||
| conv.reference.append({"chunks": [], "doc_aggs": []}) | |||
| def fillin_conv(ans): | |||
| nonlocal conv | |||
| if not conv.reference: | |||
| conv.reference.append(ans["reference"]) | |||
| else: conv.reference[-1] = ans["reference"] | |||
| conv.message[-1] = {"role": "assistant", "content": ans["answer"]} | |||
| def stream(): | |||
| nonlocal dia, msg, req, conv | |||
| try: | |||
| for ans in chat(dia, msg, True, **req): | |||
| fillin_conv(ans) | |||
| yield "data:"+json.dumps({"retcode": 0, "retmsg": "", "data": ans}, ensure_ascii=False) + "\n\n" | |||
| ConversationService.update_by_id(conv.id, conv.to_dict()) | |||
| except Exception as e: | |||
| yield "data:" + json.dumps({"retcode": 500, "retmsg": str(e), | |||
| "data": {"answer": "**ERROR**: "+str(e), "reference": []}}, | |||
| ensure_ascii=False) + "\n\n" | |||
| yield "data:"+json.dumps({"retcode": 0, "retmsg": "", "data": True}, ensure_ascii=False) + "\n\n" | |||
| if req.get("stream", True): | |||
| resp = Response(stream(), mimetype="text/event-stream") | |||
| resp.headers.add_header("Cache-control", "no-cache") | |||
| resp.headers.add_header("Connection", "keep-alive") | |||
| resp.headers.add_header("X-Accel-Buffering", "no") | |||
| resp.headers.add_header("Content-Type", "text/event-stream; charset=utf-8") | |||
| return resp | |||
| else: | |||
| answer = None | |||
| for ans in chat(dia, msg, **req): | |||
| answer = ans | |||
| fillin_conv(ans) | |||
| ConversationService.update_by_id(conv.id, conv.to_dict()) | |||
| break | |||
| return get_json_result(data=answer) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @@ -1,172 +1,172 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from flask import request | |||
| from flask_login import login_required, current_user | |||
| from api.db.services.dialog_service import DialogService | |||
| from api.db import StatusEnum | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from api.db.services.user_service import TenantService | |||
| from api.utils.api_utils import server_error_response, get_data_error_result, validate_request | |||
| from api.utils import get_uuid | |||
| from api.utils.api_utils import get_json_result | |||
| @manager.route('/set', methods=['POST']) | |||
| @login_required | |||
| def set_dialog(): | |||
| req = request.json | |||
| dialog_id = req.get("dialog_id") | |||
| name = req.get("name", "New Dialog") | |||
| description = req.get("description", "A helpful Dialog") | |||
| icon = req.get("icon", "") | |||
| top_n = req.get("top_n", 6) | |||
| top_k = req.get("top_k", 1024) | |||
| rerank_id = req.get("rerank_id", "") | |||
| if not rerank_id: req["rerank_id"] = "" | |||
| similarity_threshold = req.get("similarity_threshold", 0.1) | |||
| vector_similarity_weight = req.get("vector_similarity_weight", 0.3) | |||
| if vector_similarity_weight is None: vector_similarity_weight = 0.3 | |||
| llm_setting = req.get("llm_setting", {}) | |||
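| # The default prompt below is in Chinese; roughly: "You are an intelligent assistant. | |||
| # Summarize the knowledge base content to answer, citing its data in detail. If nothing | |||
| # in the knowledge base is relevant, the answer must contain the sentence | |||
| # '知识库中未找到您要的答案!' (no answer was found in the knowledge base). Take the chat | |||
| # history into account." {knowledge} is replaced with the retrieved passages. | |||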
| default_prompt = { | |||
| "system": """你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。 | |||
| 以下是知识库: | |||
| {knowledge} | |||
| 以上是知识库。""", | |||
| "prologue": "您好,我是您的助手小樱,长得可爱又善良,can I help you?", | |||
| "parameters": [ | |||
| {"key": "knowledge", "optional": False} | |||
| ], | |||
| "empty_response": "Sorry! 知识库中未找到相关内容!" | |||
| } | |||
| prompt_config = req.get("prompt_config", default_prompt) | |||
| if not prompt_config["system"]: | |||
| prompt_config["system"] = default_prompt["system"] | |||
| # if len(prompt_config["parameters"]) < 1: | |||
| # prompt_config["parameters"] = default_prompt["parameters"] | |||
| # for p in prompt_config["parameters"]: | |||
| # if p["key"] == "knowledge":break | |||
| # else: prompt_config["parameters"].append(default_prompt["parameters"][0]) | |||
| for p in prompt_config["parameters"]: | |||
| if p["optional"]: | |||
| continue | |||
| if prompt_config["system"].find("{%s}" % p["key"]) < 0: | |||
| return get_data_error_result( | |||
| retmsg="Parameter '{}' is not used".format(p["key"])) | |||
| try: | |||
| e, tenant = TenantService.get_by_id(current_user.id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Tenant not found!") | |||
| llm_id = req.get("llm_id", tenant.llm_id) | |||
| if not dialog_id: | |||
| if not req.get("kb_ids"): | |||
| return get_data_error_result( | |||
| retmsg="Fail! Please select knowledgebase!") | |||
| dia = { | |||
| "id": get_uuid(), | |||
| "tenant_id": current_user.id, | |||
| "name": name, | |||
| "kb_ids": req["kb_ids"], | |||
| "description": description, | |||
| "llm_id": llm_id, | |||
| "llm_setting": llm_setting, | |||
| "prompt_config": prompt_config, | |||
| "top_n": top_n, | |||
| "top_k": top_k, | |||
| "rerank_id": rerank_id, | |||
| "similarity_threshold": similarity_threshold, | |||
| "vector_similarity_weight": vector_similarity_weight, | |||
| "icon": icon | |||
| } | |||
| if not DialogService.save(**dia): | |||
| return get_data_error_result(retmsg="Fail to new a dialog!") | |||
| e, dia = DialogService.get_by_id(dia["id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Fail to new a dialog!") | |||
| return get_json_result(data=dia.to_json()) | |||
| else: | |||
| del req["dialog_id"] | |||
| if "kb_names" in req: | |||
| del req["kb_names"] | |||
| if not DialogService.update_by_id(dialog_id, req): | |||
| return get_data_error_result(retmsg="Dialog not found!") | |||
| e, dia = DialogService.get_by_id(dialog_id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Fail to update a dialog!") | |||
| dia = dia.to_dict() | |||
| dia["kb_ids"], dia["kb_names"] = get_kb_names(dia["kb_ids"]) | |||
| return get_json_result(data=dia) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/get', methods=['GET']) | |||
| @login_required | |||
| def get(): | |||
| dialog_id = request.args["dialog_id"] | |||
| try: | |||
| e, dia = DialogService.get_by_id(dialog_id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Dialog not found!") | |||
| dia = dia.to_dict() | |||
| dia["kb_ids"], dia["kb_names"] = get_kb_names(dia["kb_ids"]) | |||
| return get_json_result(data=dia) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
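| # Helper: map knowledge-base ids to (ids, names), silently skipping entries that no | |||
| # longer exist or whose status is not VALID. | |||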
| def get_kb_names(kb_ids): | |||
| ids, nms = [], [] | |||
| for kid in kb_ids: | |||
| e, kb = KnowledgebaseService.get_by_id(kid) | |||
| if not e or kb.status != StatusEnum.VALID.value: | |||
| continue | |||
| ids.append(kid) | |||
| nms.append(kb.name) | |||
| return ids, nms | |||
| @manager.route('/list', methods=['GET']) | |||
| @login_required | |||
| def list_dialogs(): | |||
| try: | |||
| diags = DialogService.query( | |||
| tenant_id=current_user.id, | |||
| status=StatusEnum.VALID.value, | |||
| reverse=True, | |||
| order_by=DialogService.model.create_time) | |||
| diags = [d.to_dict() for d in diags] | |||
| for d in diags: | |||
| d["kb_ids"], d["kb_names"] = get_kb_names(d["kb_ids"]) | |||
| return get_json_result(data=diags) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
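| # Dialogs are soft-deleted: /rm only flips their status to INVALID, the rows remain. | |||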
| @manager.route('/rm', methods=['POST']) | |||
| @login_required | |||
| @validate_request("dialog_ids") | |||
| def rm(): | |||
| req = request.json | |||
| try: | |||
| DialogService.update_many_by_id( | |||
| [{"id": id, "status": StatusEnum.INVALID.value} for id in req["dialog_ids"]]) | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from flask import request | |||
| from flask_login import login_required, current_user | |||
| from api.db.services.dialog_service import DialogService | |||
| from api.db import StatusEnum | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from api.db.services.user_service import TenantService | |||
| from api.utils.api_utils import server_error_response, get_data_error_result, validate_request | |||
| from api.utils import get_uuid | |||
| from api.utils.api_utils import get_json_result | |||
| @manager.route('/set', methods=['POST']) | |||
| @login_required | |||
| def set_dialog(): | |||
| req = request.json | |||
| dialog_id = req.get("dialog_id") | |||
| name = req.get("name", "New Dialog") | |||
| description = req.get("description", "A helpful Dialog") | |||
| icon = req.get("icon", "") | |||
| top_n = req.get("top_n", 6) | |||
| top_k = req.get("top_k", 1024) | |||
| rerank_id = req.get("rerank_id", "") | |||
| if not rerank_id: req["rerank_id"] = "" | |||
| similarity_threshold = req.get("similarity_threshold", 0.1) | |||
| vector_similarity_weight = req.get("vector_similarity_weight", 0.3) | |||
| if vector_similarity_weight is None: vector_similarity_weight = 0.3 | |||
| llm_setting = req.get("llm_setting", {}) | |||
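| # The default prompt below is in Chinese; roughly: "You are an intelligent assistant. | |||
| # Summarize the knowledge base content to answer, citing its data in detail. If nothing | |||
| # in the knowledge base is relevant, the answer must contain the sentence | |||
| # '知识库中未找到您要的答案!' (no answer was found in the knowledge base). Take the chat | |||
| # history into account." {knowledge} is replaced with the retrieved passages. | |||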
| default_prompt = { | |||
| "system": """你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。 | |||
| 以下是知识库: | |||
| {knowledge} | |||
| 以上是知识库。""", | |||
| "prologue": "您好,我是您的助手小樱,长得可爱又善良,can I help you?", | |||
| "parameters": [ | |||
| {"key": "knowledge", "optional": False} | |||
| ], | |||
| "empty_response": "Sorry! 知识库中未找到相关内容!" | |||
| } | |||
| prompt_config = req.get("prompt_config", default_prompt) | |||
| if not prompt_config["system"]: | |||
| prompt_config["system"] = default_prompt["system"] | |||
| # if len(prompt_config["parameters"]) < 1: | |||
| # prompt_config["parameters"] = default_prompt["parameters"] | |||
| # for p in prompt_config["parameters"]: | |||
| # if p["key"] == "knowledge":break | |||
| # else: prompt_config["parameters"].append(default_prompt["parameters"][0]) | |||
| for p in prompt_config["parameters"]: | |||
| if p["optional"]: | |||
| continue | |||
| if prompt_config["system"].find("{%s}" % p["key"]) < 0: | |||
| return get_data_error_result( | |||
| retmsg="Parameter '{}' is not used".format(p["key"])) | |||
| try: | |||
| e, tenant = TenantService.get_by_id(current_user.id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Tenant not found!") | |||
| llm_id = req.get("llm_id", tenant.llm_id) | |||
| if not dialog_id: | |||
| if not req.get("kb_ids"): | |||
| return get_data_error_result( | |||
| retmsg="Fail! Please select knowledgebase!") | |||
| dia = { | |||
| "id": get_uuid(), | |||
| "tenant_id": current_user.id, | |||
| "name": name, | |||
| "kb_ids": req["kb_ids"], | |||
| "description": description, | |||
| "llm_id": llm_id, | |||
| "llm_setting": llm_setting, | |||
| "prompt_config": prompt_config, | |||
| "top_n": top_n, | |||
| "top_k": top_k, | |||
| "rerank_id": rerank_id, | |||
| "similarity_threshold": similarity_threshold, | |||
| "vector_similarity_weight": vector_similarity_weight, | |||
| "icon": icon | |||
| } | |||
| if not DialogService.save(**dia): | |||
| return get_data_error_result(retmsg="Fail to new a dialog!") | |||
| e, dia = DialogService.get_by_id(dia["id"]) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Fail to new a dialog!") | |||
| return get_json_result(data=dia.to_json()) | |||
| else: | |||
| del req["dialog_id"] | |||
| if "kb_names" in req: | |||
| del req["kb_names"] | |||
| if not DialogService.update_by_id(dialog_id, req): | |||
| return get_data_error_result(retmsg="Dialog not found!") | |||
| e, dia = DialogService.get_by_id(dialog_id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Fail to update a dialog!") | |||
| dia = dia.to_dict() | |||
| dia["kb_ids"], dia["kb_names"] = get_kb_names(dia["kb_ids"]) | |||
| return get_json_result(data=dia) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/get', methods=['GET']) | |||
| @login_required | |||
| def get(): | |||
| dialog_id = request.args["dialog_id"] | |||
| try: | |||
| e, dia = DialogService.get_by_id(dialog_id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Dialog not found!") | |||
| dia = dia.to_dict() | |||
| dia["kb_ids"], dia["kb_names"] = get_kb_names(dia["kb_ids"]) | |||
| return get_json_result(data=dia) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| def get_kb_names(kb_ids): | |||
| ids, nms = [], [] | |||
| for kid in kb_ids: | |||
| e, kb = KnowledgebaseService.get_by_id(kid) | |||
| if not e or kb.status != StatusEnum.VALID.value: | |||
| continue | |||
| ids.append(kid) | |||
| nms.append(kb.name) | |||
| return ids, nms | |||
| @manager.route('/list', methods=['GET']) | |||
| @login_required | |||
| def list_dialogs(): | |||
| try: | |||
| diags = DialogService.query( | |||
| tenant_id=current_user.id, | |||
| status=StatusEnum.VALID.value, | |||
| reverse=True, | |||
| order_by=DialogService.model.create_time) | |||
| diags = [d.to_dict() for d in diags] | |||
| for d in diags: | |||
| d["kb_ids"], d["kb_names"] = get_kb_names(d["kb_ids"]) | |||
| return get_json_result(data=diags) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/rm', methods=['POST']) | |||
| @login_required | |||
| @validate_request("dialog_ids") | |||
| def rm(): | |||
| req = request.json | |||
| try: | |||
| DialogService.update_many_by_id( | |||
| [{"id": id, "status": StatusEnum.INVALID.value} for id in req["dialog_ids"]]) | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @@ -1,153 +1,153 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from elasticsearch_dsl import Q | |||
| from flask import request | |||
| from flask_login import login_required, current_user | |||
| from api.db.services import duplicate_name | |||
| from api.db.services.document_service import DocumentService | |||
| from api.db.services.file2document_service import File2DocumentService | |||
| from api.db.services.file_service import FileService | |||
| from api.db.services.user_service import TenantService, UserTenantService | |||
| from api.utils.api_utils import server_error_response, get_data_error_result, validate_request | |||
| from api.utils import get_uuid, get_format_time | |||
| from api.db import StatusEnum, UserTenantRole, FileSource | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from api.db.db_models import Knowledgebase, File | |||
| from api.settings import stat_logger, RetCode | |||
| from api.utils.api_utils import get_json_result | |||
| from rag.nlp import search | |||
| from rag.utils.es_conn import ELASTICSEARCH | |||
| @manager.route('/create', methods=['post']) | |||
| @login_required | |||
| @validate_request("name") | |||
| def create(): | |||
| req = request.json | |||
| req["name"] = req["name"].strip() | |||
| req["name"] = duplicate_name( | |||
| KnowledgebaseService.query, | |||
| name=req["name"], | |||
| tenant_id=current_user.id, | |||
| status=StatusEnum.VALID.value) | |||
| try: | |||
| req["id"] = get_uuid() | |||
| req["tenant_id"] = current_user.id | |||
| req["created_by"] = current_user.id | |||
| e, t = TenantService.get_by_id(current_user.id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Tenant not found.") | |||
| req["embd_id"] = t.embd_id | |||
| if not KnowledgebaseService.save(**req): | |||
| return get_data_error_result() | |||
| return get_json_result(data={"kb_id": req["id"]}) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/update', methods=['post']) | |||
| @login_required | |||
| @validate_request("kb_id", "name", "description", "permission", "parser_id") | |||
| def update(): | |||
| req = request.json | |||
| req["name"] = req["name"].strip() | |||
| try: | |||
| if not KnowledgebaseService.query( | |||
| created_by=current_user.id, id=req["kb_id"]): | |||
| return get_json_result( | |||
| data=False, retmsg='Only the owner of the knowledge base is authorized for this operation.', retcode=RetCode.OPERATING_ERROR) | |||
| e, kb = KnowledgebaseService.get_by_id(req["kb_id"]) | |||
| if not e: | |||
| return get_data_error_result( | |||
| retmsg="Can't find this knowledgebase!") | |||
| if req["name"].lower() != kb.name.lower() \ | |||
| and len(KnowledgebaseService.query(name=req["name"], tenant_id=current_user.id, status=StatusEnum.VALID.value)) > 1: | |||
| return get_data_error_result( | |||
| retmsg="Duplicated knowledgebase name.") | |||
| del req["kb_id"] | |||
| if not KnowledgebaseService.update_by_id(kb.id, req): | |||
| return get_data_error_result() | |||
| e, kb = KnowledgebaseService.get_by_id(kb.id) | |||
| if not e: | |||
| return get_data_error_result( | |||
| retmsg="Database error (Knowledgebase rename)!") | |||
| return get_json_result(data=kb.to_json()) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/detail', methods=['GET']) | |||
| @login_required | |||
| def detail(): | |||
| kb_id = request.args["kb_id"] | |||
| try: | |||
| kb = KnowledgebaseService.get_detail(kb_id) | |||
| if not kb: | |||
| return get_data_error_result( | |||
| retmsg="Can't find this knowledgebase!") | |||
| return get_json_result(data=kb) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/list', methods=['GET']) | |||
| @login_required | |||
| def list_kbs(): | |||
| page_number = request.args.get("page", 1) | |||
| items_per_page = request.args.get("page_size", 150) | |||
| orderby = request.args.get("orderby", "create_time") | |||
| desc = request.args.get("desc", True) | |||
| try: | |||
| tenants = TenantService.get_joined_tenants_by_user_id(current_user.id) | |||
| kbs = KnowledgebaseService.get_by_tenant_ids( | |||
| [m["tenant_id"] for m in tenants], current_user.id, page_number, items_per_page, orderby, desc) | |||
| return get_json_result(data=kbs) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
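| # Removing a knowledge base cascades: every document in it is removed, its backing | |||
| # file links are deleted, and finally the knowledge base record itself is dropped. | |||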
| @manager.route('/rm', methods=['post']) | |||
| @login_required | |||
| @validate_request("kb_id") | |||
| def rm(): | |||
| req = request.json | |||
| try: | |||
| kbs = KnowledgebaseService.query( | |||
| created_by=current_user.id, id=req["kb_id"]) | |||
| if not kbs: | |||
| return get_json_result( | |||
| data=False, retmsg='Only the owner of the knowledge base is authorized for this operation.', retcode=RetCode.OPERATING_ERROR) | |||
| for doc in DocumentService.query(kb_id=req["kb_id"]): | |||
| if not DocumentService.remove_document(doc, kbs[0].tenant_id): | |||
| return get_data_error_result( | |||
| retmsg="Database error (Document removal)!") | |||
| f2d = File2DocumentService.get_by_document_id(doc.id) | |||
| FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id]) | |||
| File2DocumentService.delete_by_document_id(doc.id) | |||
| if not KnowledgebaseService.delete_by_id(req["kb_id"]): | |||
| return get_data_error_result( | |||
| retmsg="Database error (Knowledgebase removal)!") | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from elasticsearch_dsl import Q | |||
| from flask import request | |||
| from flask_login import login_required, current_user | |||
| from api.db.services import duplicate_name | |||
| from api.db.services.document_service import DocumentService | |||
| from api.db.services.file2document_service import File2DocumentService | |||
| from api.db.services.file_service import FileService | |||
| from api.db.services.user_service import TenantService, UserTenantService | |||
| from api.utils.api_utils import server_error_response, get_data_error_result, validate_request | |||
| from api.utils import get_uuid, get_format_time | |||
| from api.db import StatusEnum, UserTenantRole, FileSource | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from api.db.db_models import Knowledgebase, File | |||
| from api.settings import stat_logger, RetCode | |||
| from api.utils.api_utils import get_json_result | |||
| from rag.nlp import search | |||
| from rag.utils.es_conn import ELASTICSEARCH | |||
| @manager.route('/create', methods=['post']) | |||
| @login_required | |||
| @validate_request("name") | |||
| def create(): | |||
| req = request.json | |||
| req["name"] = req["name"].strip() | |||
| req["name"] = duplicate_name( | |||
| KnowledgebaseService.query, | |||
| name=req["name"], | |||
| tenant_id=current_user.id, | |||
| status=StatusEnum.VALID.value) | |||
| try: | |||
| req["id"] = get_uuid() | |||
| req["tenant_id"] = current_user.id | |||
| req["created_by"] = current_user.id | |||
| e, t = TenantService.get_by_id(current_user.id) | |||
| if not e: | |||
| return get_data_error_result(retmsg="Tenant not found.") | |||
| req["embd_id"] = t.embd_id | |||
| if not KnowledgebaseService.save(**req): | |||
| return get_data_error_result() | |||
| return get_json_result(data={"kb_id": req["id"]}) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/update', methods=['post']) | |||
| @login_required | |||
| @validate_request("kb_id", "name", "description", "permission", "parser_id") | |||
| def update(): | |||
| req = request.json | |||
| req["name"] = req["name"].strip() | |||
| try: | |||
| if not KnowledgebaseService.query( | |||
| created_by=current_user.id, id=req["kb_id"]): | |||
| return get_json_result( | |||
| data=False, retmsg='Only the owner of the knowledge base is authorized for this operation.', retcode=RetCode.OPERATING_ERROR) | |||
| e, kb = KnowledgebaseService.get_by_id(req["kb_id"]) | |||
| if not e: | |||
| return get_data_error_result( | |||
| retmsg="Can't find this knowledgebase!") | |||
| if req["name"].lower() != kb.name.lower() \ | |||
| and len(KnowledgebaseService.query(name=req["name"], tenant_id=current_user.id, status=StatusEnum.VALID.value)) > 1: | |||
| return get_data_error_result( | |||
| retmsg="Duplicated knowledgebase name.") | |||
| del req["kb_id"] | |||
| if not KnowledgebaseService.update_by_id(kb.id, req): | |||
| return get_data_error_result() | |||
| e, kb = KnowledgebaseService.get_by_id(kb.id) | |||
| if not e: | |||
| return get_data_error_result( | |||
| retmsg="Database error (Knowledgebase rename)!") | |||
| return get_json_result(data=kb.to_json()) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/detail', methods=['GET']) | |||
| @login_required | |||
| def detail(): | |||
| kb_id = request.args["kb_id"] | |||
| try: | |||
| kb = KnowledgebaseService.get_detail(kb_id) | |||
| if not kb: | |||
| return get_data_error_result( | |||
| retmsg="Can't find this knowledgebase!") | |||
| return get_json_result(data=kb) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/list', methods=['GET']) | |||
| @login_required | |||
| def list_kbs(): | |||
| page_number = request.args.get("page", 1) | |||
| items_per_page = request.args.get("page_size", 150) | |||
| orderby = request.args.get("orderby", "create_time") | |||
| desc = request.args.get("desc", True) | |||
| try: | |||
| tenants = TenantService.get_joined_tenants_by_user_id(current_user.id) | |||
| kbs = KnowledgebaseService.get_by_tenant_ids( | |||
| [m["tenant_id"] for m in tenants], current_user.id, page_number, items_per_page, orderby, desc) | |||
| return get_json_result(data=kbs) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/rm', methods=['post']) | |||
| @login_required | |||
| @validate_request("kb_id") | |||
| def rm(): | |||
| req = request.json | |||
| try: | |||
| kbs = KnowledgebaseService.query( | |||
| created_by=current_user.id, id=req["kb_id"]) | |||
| if not kbs: | |||
| return get_json_result( | |||
| data=False, retmsg='Only the owner of the knowledge base is authorized for this operation.', retcode=RetCode.OPERATING_ERROR) | |||
| for doc in DocumentService.query(kb_id=req["kb_id"]): | |||
| if not DocumentService.remove_document(doc, kbs[0].tenant_id): | |||
| return get_data_error_result( | |||
| retmsg="Database error (Document removal)!") | |||
| f2d = File2DocumentService.get_by_document_id(doc.id) | |||
| FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id]) | |||
| File2DocumentService.delete_by_document_id(doc.id) | |||
| if not KnowledgebaseService.delete_by_id(req["kb_id"]): | |||
| return get_data_error_result( | |||
| retmsg="Database error (Knowledgebase removal)!") | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @@ -1,279 +1,279 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from flask import request | |||
| from flask_login import login_required, current_user | |||
| from api.db.services.llm_service import LLMFactoriesService, TenantLLMService, LLMService | |||
| from api.utils.api_utils import server_error_response, get_data_error_result, validate_request | |||
| from api.db import StatusEnum, LLMType | |||
| from api.db.db_models import TenantLLM | |||
| from api.utils.api_utils import get_json_result | |||
| from rag.llm import EmbeddingModel, ChatModel, RerankModel, CvModel | |||
| import requests | |||
| import ast | |||
| @manager.route('/factories', methods=['GET']) | |||
| @login_required | |||
| def factories(): | |||
| try: | |||
| fac = LLMFactoriesService.get_all() | |||
| return get_json_result(data=[f.to_dict() for f in fac if f.name not in ["Youdao", "FastEmbed", "BAAI"]]) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
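| # /set_api_key probes the key before saving it: one embedding, one chat and one rerank | |||
| # model of the chosen factory are called with tiny test inputs, and the key is stored | |||
| # per tenant only if none of the probes reported an error. | |||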
| @manager.route('/set_api_key', methods=['POST']) | |||
| @login_required | |||
| @validate_request("llm_factory", "api_key") | |||
| def set_api_key(): | |||
| req = request.json | |||
| # test if api key works | |||
| chat_passed, embd_passed, rerank_passed = False, False, False | |||
| factory = req["llm_factory"] | |||
| msg = "" | |||
| for llm in LLMService.query(fid=factory): | |||
| if not embd_passed and llm.model_type == LLMType.EMBEDDING.value: | |||
| mdl = EmbeddingModel[factory]( | |||
| req["api_key"], llm.llm_name, base_url=req.get("base_url")) | |||
| try: | |||
| arr, tc = mdl.encode(["Test if the api key is available"]) | |||
| if len(arr[0]) == 0: | |||
| raise Exception("Fail") | |||
| embd_passed = True | |||
| except Exception as e: | |||
| msg += f"\nFail to access embedding model({llm.llm_name}) using this api key." + str(e) | |||
| elif not chat_passed and llm.model_type == LLMType.CHAT.value: | |||
| mdl = ChatModel[factory]( | |||
| req["api_key"], llm.llm_name, base_url=req.get("base_url")) | |||
| try: | |||
| m, tc = mdl.chat(None, [{"role": "user", "content": "Hello! How are you doing!"}], | |||
| {"temperature": 0.9,'max_tokens':50}) | |||
| if m.find("**ERROR**") >=0: | |||
| raise Exception(m) | |||
| except Exception as e: | |||
| msg += f"\nFail to access model({llm.llm_name}) using this api key." + str( | |||
| e) | |||
| chat_passed = True | |||
| elif not rerank_passed and llm.model_type == LLMType.RERANK: | |||
| mdl = RerankModel[factory]( | |||
| req["api_key"], llm.llm_name, base_url=req.get("base_url")) | |||
| try: | |||
| arr, tc = mdl.similarity("What's the weather?", ["Is it sunny today?"]) | |||
| if len(arr) == 0 or tc == 0: | |||
| raise Exception("Fail") | |||
| # mark the rerank check as passed only when the probe call succeeded | |||
| rerank_passed = True | |||
| except Exception as e: | |||
| msg += f"\nFail to access model({llm.llm_name}) using this api key." + str(e) | |||
| if msg: | |||
| return get_data_error_result(retmsg=msg) | |||
| llm = { | |||
| "api_key": req["api_key"], | |||
| "api_base": req.get("base_url", "") | |||
| } | |||
| for n in ["model_type", "llm_name"]: | |||
| if n in req: | |||
| llm[n] = req[n] | |||
| if not TenantLLMService.filter_update( | |||
| [TenantLLM.tenant_id == current_user.id, TenantLLM.llm_factory == factory], llm): | |||
| for llm in LLMService.query(fid=factory): | |||
| TenantLLMService.save( | |||
| tenant_id=current_user.id, | |||
| llm_factory=factory, | |||
| llm_name=llm.llm_name, | |||
| model_type=llm.model_type, | |||
| api_key=req["api_key"], | |||
| api_base=req.get("base_url", "") | |||
| ) | |||
| return get_json_result(data=True) | |||
| @manager.route('/add_llm', methods=['POST']) | |||
| @login_required | |||
| @validate_request("llm_factory", "llm_name", "model_type") | |||
| def add_llm(): | |||
| req = request.json | |||
| factory = req["llm_factory"] | |||
| if factory == "VolcEngine": | |||
| # For VolcEngine, due to its special authentication method | |||
| # Assemble volc_ak, volc_sk, endpoint_id into api_key | |||
| temp = list(ast.literal_eval(req["llm_name"]).items())[0] | |||
| llm_name = temp[0] | |||
| endpoint_id = temp[1] | |||
| api_key = '{' + f'"volc_ak": "{req.get("volc_ak", "")}", ' \ | |||
| f'"volc_sk": "{req.get("volc_sk", "")}", ' \ | |||
| f'"ep_id": "{endpoint_id}", ' + '}' | |||
| elif factory == "Bedrock": | |||
| # For Bedrock, due to its special authentication method | |||
| # Assemble bedrock_ak, bedrock_sk, bedrock_region | |||
| llm_name = req["llm_name"] | |||
| api_key = '{' + f'"bedrock_ak": "{req.get("bedrock_ak", "")}", ' \ | |||
| f'"bedrock_sk": "{req.get("bedrock_sk", "")}", ' \ | |||
| f'"bedrock_region": "{req.get("bedrock_region", "")}", ' + '}' | |||
| elif factory == "LocalAI": | |||
| llm_name = req["llm_name"]+"___LocalAI" | |||
| api_key = "xxxxxxxxxxxxxxx" | |||
| elif factory == "OpenAI-API-Compatible": | |||
| llm_name = req["llm_name"]+"___OpenAI-API" | |||
| api_key = req.get("api_key","xxxxxxxxxxxxxxx") | |||
| else: | |||
| llm_name = req["llm_name"] | |||
| api_key = req.get("api_key","xxxxxxxxxxxxxxx") | |||
| llm = { | |||
| "tenant_id": current_user.id, | |||
| "llm_factory": factory, | |||
| "model_type": req["model_type"], | |||
| "llm_name": llm_name, | |||
| "api_base": req.get("api_base", ""), | |||
| "api_key": api_key | |||
| } | |||
| msg = "" | |||
| if llm["model_type"] == LLMType.EMBEDDING.value: | |||
| mdl = EmbeddingModel[factory]( | |||
| key=llm['api_key'] if factory in ["VolcEngine", "Bedrock","OpenAI-API-Compatible"] else None, | |||
| model_name=llm["llm_name"], | |||
| base_url=llm["api_base"]) | |||
| try: | |||
| arr, tc = mdl.encode(["Test if the api key is available"]) | |||
| if len(arr[0]) == 0 or tc == 0: | |||
| raise Exception("Fail") | |||
| except Exception as e: | |||
| msg += f"\nFail to access embedding model({llm['llm_name']})." + str(e) | |||
| elif llm["model_type"] == LLMType.CHAT.value: | |||
| mdl = ChatModel[factory]( | |||
| key=llm['api_key'] if factory in ["VolcEngine", "Bedrock","OpenAI-API-Compatible"] else None, | |||
| model_name=llm["llm_name"], | |||
| base_url=llm["api_base"] | |||
| ) | |||
| try: | |||
| m, tc = mdl.chat(None, [{"role": "user", "content": "Hello! How are you doing!"}], { | |||
| "temperature": 0.9}) | |||
| if not tc: | |||
| raise Exception(m) | |||
| except Exception as e: | |||
| msg += f"\nFail to access model({llm['llm_name']})." + str( | |||
| e) | |||
| elif llm["model_type"] == LLMType.RERANK: | |||
| mdl = RerankModel[factory]( | |||
| key=None, model_name=llm["llm_name"], base_url=llm["api_base"] | |||
| ) | |||
| try: | |||
| arr, tc = mdl.similarity("Hello~ Ragflower!", ["Hi, there!"]) | |||
| if len(arr) == 0 or tc == 0: | |||
| raise Exception("Not known.") | |||
| except Exception as e: | |||
| msg += f"\nFail to access model({llm['llm_name']})." + str( | |||
| e) | |||
| elif llm["model_type"] == LLMType.IMAGE2TEXT.value: | |||
| mdl = CvModel[factory]( | |||
| key=llm["api_key"] if factory in ["OpenAI-API-Compatible"] else None, model_name=llm["llm_name"], base_url=llm["api_base"] | |||
| ) | |||
| try: | |||
| img_url = ( | |||
| "https://upload.wikimedia.org/wikipedia/comm" | |||
| "ons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/256" | |||
| "0px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" | |||
| ) | |||
| res = requests.get(img_url) | |||
| if res.status_code == 200: | |||
| m, tc = mdl.describe(res.content) | |||
| if not tc: | |||
| raise Exception(m) | |||
| else: | |||
| pass | |||
| except Exception as e: | |||
| msg += f"\nFail to access model({llm['llm_name']})." + str(e) | |||
| else: | |||
| # TODO: check other type of models | |||
| pass | |||
| if msg: | |||
| return get_data_error_result(retmsg=msg) | |||
| if not TenantLLMService.filter_update( | |||
| [TenantLLM.tenant_id == current_user.id, TenantLLM.llm_factory == factory, TenantLLM.llm_name == llm["llm_name"]], llm): | |||
| TenantLLMService.save(**llm) | |||
| return get_json_result(data=True) | |||
| @manager.route('/delete_llm', methods=['POST']) | |||
| @login_required | |||
| @validate_request("llm_factory", "llm_name") | |||
| def delete_llm(): | |||
| req = request.json | |||
| TenantLLMService.filter_delete( | |||
| [TenantLLM.tenant_id == current_user.id, TenantLLM.llm_factory == req["llm_factory"], TenantLLM.llm_name == req["llm_name"]]) | |||
| return get_json_result(data=True) | |||
| @manager.route('/my_llms', methods=['GET']) | |||
| @login_required | |||
| def my_llms(): | |||
| try: | |||
| res = {} | |||
| for o in TenantLLMService.get_my_llms(current_user.id): | |||
| if o["llm_factory"] not in res: | |||
| res[o["llm_factory"]] = { | |||
| "tags": o["tags"], | |||
| "llm": [] | |||
| } | |||
| res[o["llm_factory"]]["llm"].append({ | |||
| "type": o["model_type"], | |||
| "name": o["llm_name"], | |||
| "used_token": o["used_tokens"] | |||
| }) | |||
| return get_json_result(data=res) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
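| # /list returns every model grouped by factory; a model is flagged "available" when its | |||
| # factory has a stored api key, when it is the built-in flag-embedding model, or when it | |||
| # belongs to the key-free factories Youdao, FastEmbed or BAAI. Tenant-added models that | |||
| # are not in the catalogue are appended and marked available. | |||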
| @manager.route('/list', methods=['GET']) | |||
| @login_required | |||
| def list_app(): | |||
| model_type = request.args.get("model_type") | |||
| try: | |||
| objs = TenantLLMService.query(tenant_id=current_user.id) | |||
| facts = set([o.to_dict()["llm_factory"] for o in objs if o.api_key]) | |||
| llms = LLMService.get_all() | |||
| llms = [m.to_dict() | |||
| for m in llms if m.status == StatusEnum.VALID.value] | |||
| for m in llms: | |||
| m["available"] = m["fid"] in facts or m["llm_name"].lower() == "flag-embedding" or m["fid"] in ["Youdao","FastEmbed", "BAAI"] | |||
| llm_set = set([m["llm_name"] for m in llms]) | |||
| for o in objs: | |||
| if not o.api_key:continue | |||
| if o.llm_name in llm_set:continue | |||
| llms.append({"llm_name": o.llm_name, "model_type": o.model_type, "fid": o.llm_factory, "available": True}) | |||
| res = {} | |||
| for m in llms: | |||
| if model_type and m["model_type"].find(model_type)<0: | |||
| continue | |||
| if m["fid"] not in res: | |||
| res[m["fid"]] = [] | |||
| res[m["fid"]].append(m) | |||
| return get_json_result(data=res) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from flask import request | |||
| from flask_login import login_required, current_user | |||
| from api.db.services.llm_service import LLMFactoriesService, TenantLLMService, LLMService | |||
| from api.utils.api_utils import server_error_response, get_data_error_result, validate_request | |||
| from api.db import StatusEnum, LLMType | |||
| from api.db.db_models import TenantLLM | |||
| from api.utils.api_utils import get_json_result | |||
| from rag.llm import EmbeddingModel, ChatModel, RerankModel, CvModel | |||
| import requests | |||
| import ast | |||
| @manager.route('/factories', methods=['GET']) | |||
| @login_required | |||
| def factories(): | |||
| try: | |||
| fac = LLMFactoriesService.get_all() | |||
| return get_json_result(data=[f.to_dict() for f in fac if f.name not in ["Youdao", "FastEmbed", "BAAI"]]) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/set_api_key', methods=['POST']) | |||
| @login_required | |||
| @validate_request("llm_factory", "api_key") | |||
| def set_api_key(): | |||
| req = request.json | |||
| # test if api key works | |||
| chat_passed, embd_passed, rerank_passed = False, False, False | |||
| factory = req["llm_factory"] | |||
| msg = "" | |||
| for llm in LLMService.query(fid=factory): | |||
| if not embd_passed and llm.model_type == LLMType.EMBEDDING.value: | |||
| mdl = EmbeddingModel[factory]( | |||
| req["api_key"], llm.llm_name, base_url=req.get("base_url")) | |||
| try: | |||
| arr, tc = mdl.encode(["Test if the api key is available"]) | |||
| if len(arr[0]) == 0: | |||
| raise Exception("Fail") | |||
| embd_passed = True | |||
| except Exception as e: | |||
| msg += f"\nFail to access embedding model({llm.llm_name}) using this api key." + str(e) | |||
| elif not chat_passed and llm.model_type == LLMType.CHAT.value: | |||
| mdl = ChatModel[factory]( | |||
| req["api_key"], llm.llm_name, base_url=req.get("base_url")) | |||
| try: | |||
| m, tc = mdl.chat(None, [{"role": "user", "content": "Hello! How are you doing!"}], | |||
| {"temperature": 0.9,'max_tokens':50}) | |||
| if m.find("**ERROR**") >=0: | |||
| raise Exception(m) | |||
| except Exception as e: | |||
| msg += f"\nFail to access model({llm.llm_name}) using this api key." + str( | |||
| e) | |||
| chat_passed = True | |||
| elif not rerank_passed and llm.model_type == LLMType.RERANK: | |||
| mdl = RerankModel[factory]( | |||
| req["api_key"], llm.llm_name, base_url=req.get("base_url")) | |||
| try: | |||
| arr, tc = mdl.similarity("What's the weather?", ["Is it sunny today?"]) | |||
| if len(arr) == 0 or tc == 0: | |||
| raise Exception("Fail") | |||
| # mark the rerank check as passed only when the probe call succeeded | |||
| rerank_passed = True | |||
| except Exception as e: | |||
| msg += f"\nFail to access model({llm.llm_name}) using this api key." + str(e) | |||
| if msg: | |||
| return get_data_error_result(retmsg=msg) | |||
| llm = { | |||
| "api_key": req["api_key"], | |||
| "api_base": req.get("base_url", "") | |||
| } | |||
| for n in ["model_type", "llm_name"]: | |||
| if n in req: | |||
| llm[n] = req[n] | |||
| if not TenantLLMService.filter_update( | |||
| [TenantLLM.tenant_id == current_user.id, TenantLLM.llm_factory == factory], llm): | |||
| for llm in LLMService.query(fid=factory): | |||
| TenantLLMService.save( | |||
| tenant_id=current_user.id, | |||
| llm_factory=factory, | |||
| llm_name=llm.llm_name, | |||
| model_type=llm.model_type, | |||
| api_key=req["api_key"], | |||
| api_base=req.get("base_url", "") | |||
| ) | |||
| return get_json_result(data=True) | |||
| @manager.route('/add_llm', methods=['POST']) | |||
| @login_required | |||
| @validate_request("llm_factory", "llm_name", "model_type") | |||
| def add_llm(): | |||
| req = request.json | |||
| factory = req["llm_factory"] | |||
| if factory == "VolcEngine": | |||
| # For VolcEngine, due to its special authentication method | |||
| # Assemble volc_ak, volc_sk, endpoint_id into api_key | |||
| temp = list(ast.literal_eval(req["llm_name"]).items())[0] | |||
| llm_name = temp[0] | |||
| endpoint_id = temp[1] | |||
| api_key = '{' + f'"volc_ak": "{req.get("volc_ak", "")}", ' \ | |||
| f'"volc_sk": "{req.get("volc_sk", "")}", ' \ | |||
| f'"ep_id": "{endpoint_id}", ' + '}' | |||
| elif factory == "Bedrock": | |||
| # For Bedrock, due to its special authentication method | |||
| # Assemble bedrock_ak, bedrock_sk, bedrock_region | |||
| llm_name = req["llm_name"] | |||
| api_key = '{' + f'"bedrock_ak": "{req.get("bedrock_ak", "")}", ' \ | |||
| f'"bedrock_sk": "{req.get("bedrock_sk", "")}", ' \ | |||
| f'"bedrock_region": "{req.get("bedrock_region", "")}", ' + '}' | |||
| elif factory == "LocalAI": | |||
| llm_name = req["llm_name"]+"___LocalAI" | |||
| api_key = "xxxxxxxxxxxxxxx" | |||
| elif factory == "OpenAI-API-Compatible": | |||
| llm_name = req["llm_name"]+"___OpenAI-API" | |||
| api_key = req.get("api_key","xxxxxxxxxxxxxxx") | |||
| else: | |||
| llm_name = req["llm_name"] | |||
| api_key = req.get("api_key","xxxxxxxxxxxxxxx") | |||
| llm = { | |||
| "tenant_id": current_user.id, | |||
| "llm_factory": factory, | |||
| "model_type": req["model_type"], | |||
| "llm_name": llm_name, | |||
| "api_base": req.get("api_base", ""), | |||
| "api_key": api_key | |||
| } | |||
| msg = "" | |||
| if llm["model_type"] == LLMType.EMBEDDING.value: | |||
| mdl = EmbeddingModel[factory]( | |||
| key=llm['api_key'] if factory in ["VolcEngine", "Bedrock","OpenAI-API-Compatible"] else None, | |||
| model_name=llm["llm_name"], | |||
| base_url=llm["api_base"]) | |||
| try: | |||
| arr, tc = mdl.encode(["Test if the api key is available"]) | |||
| if len(arr[0]) == 0 or tc == 0: | |||
| raise Exception("Fail") | |||
| except Exception as e: | |||
| msg += f"\nFail to access embedding model({llm['llm_name']})." + str(e) | |||
| elif llm["model_type"] == LLMType.CHAT.value: | |||
| mdl = ChatModel[factory]( | |||
| key=llm['api_key'] if factory in ["VolcEngine", "Bedrock","OpenAI-API-Compatible"] else None, | |||
| model_name=llm["llm_name"], | |||
| base_url=llm["api_base"] | |||
| ) | |||
| try: | |||
| m, tc = mdl.chat(None, [{"role": "user", "content": "Hello! How are you doing!"}], { | |||
| "temperature": 0.9}) | |||
| if not tc: | |||
| raise Exception(m) | |||
| except Exception as e: | |||
| msg += f"\nFail to access model({llm['llm_name']})." + str( | |||
| e) | |||
| elif llm["model_type"] == LLMType.RERANK: | |||
| mdl = RerankModel[factory]( | |||
| key=None, model_name=llm["llm_name"], base_url=llm["api_base"] | |||
| ) | |||
| try: | |||
| arr, tc = mdl.similarity("Hello~ Ragflower!", ["Hi, there!"]) | |||
| if len(arr) == 0 or tc == 0: | |||
| raise Exception("Not known.") | |||
| except Exception as e: | |||
| msg += f"\nFail to access model({llm['llm_name']})." + str( | |||
| e) | |||
| elif llm["model_type"] == LLMType.IMAGE2TEXT.value: | |||
| mdl = CvModel[factory]( | |||
| key=llm["api_key"] if factory in ["OpenAI-API-Compatible"] else None, model_name=llm["llm_name"], base_url=llm["api_base"] | |||
| ) | |||
| try: | |||
| img_url = ( | |||
| "https://upload.wikimedia.org/wikipedia/comm" | |||
| "ons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/256" | |||
| "0px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" | |||
| ) | |||
| res = requests.get(img_url) | |||
| if res.status_code == 200: | |||
| m, tc = mdl.describe(res.content) | |||
| if not tc: | |||
| raise Exception(m) | |||
| else: | |||
| pass | |||
| except Exception as e: | |||
| msg += f"\nFail to access model({llm['llm_name']})." + str(e) | |||
| else: | |||
| # TODO: check other type of models | |||
| pass | |||
| if msg: | |||
| return get_data_error_result(retmsg=msg) | |||
| if not TenantLLMService.filter_update( | |||
| [TenantLLM.tenant_id == current_user.id, TenantLLM.llm_factory == factory, TenantLLM.llm_name == llm["llm_name"]], llm): | |||
| TenantLLMService.save(**llm) | |||
| return get_json_result(data=True) | |||
| @manager.route('/delete_llm', methods=['POST']) | |||
| @login_required | |||
| @validate_request("llm_factory", "llm_name") | |||
| def delete_llm(): | |||
| req = request.json | |||
| TenantLLMService.filter_delete( | |||
| [TenantLLM.tenant_id == current_user.id, TenantLLM.llm_factory == req["llm_factory"], TenantLLM.llm_name == req["llm_name"]]) | |||
| return get_json_result(data=True) | |||
| @manager.route('/my_llms', methods=['GET']) | |||
| @login_required | |||
| def my_llms(): | |||
| try: | |||
| res = {} | |||
| for o in TenantLLMService.get_my_llms(current_user.id): | |||
| if o["llm_factory"] not in res: | |||
| res[o["llm_factory"]] = { | |||
| "tags": o["tags"], | |||
| "llm": [] | |||
| } | |||
| res[o["llm_factory"]]["llm"].append({ | |||
| "type": o["model_type"], | |||
| "name": o["llm_name"], | |||
| "used_token": o["used_tokens"] | |||
| }) | |||
| return get_json_result(data=res) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route('/list', methods=['GET']) | |||
| @login_required | |||
| def list_app(): | |||
| model_type = request.args.get("model_type") | |||
| try: | |||
| objs = TenantLLMService.query(tenant_id=current_user.id) | |||
| facts = set([o.to_dict()["llm_factory"] for o in objs if o.api_key]) | |||
| llms = LLMService.get_all() | |||
| llms = [m.to_dict() | |||
| for m in llms if m.status == StatusEnum.VALID.value] | |||
| for m in llms: | |||
| m["available"] = m["fid"] in facts or m["llm_name"].lower() == "flag-embedding" or m["fid"] in ["Youdao","FastEmbed", "BAAI"] | |||
| llm_set = set([m["llm_name"] for m in llms]) | |||
| for o in objs: | |||
| if not o.api_key:continue | |||
| if o.llm_name in llm_set:continue | |||
| llms.append({"llm_name": o.llm_name, "model_type": o.model_type, "fid": o.llm_factory, "available": True}) | |||
| res = {} | |||
| for m in llms: | |||
| if model_type and m["model_type"].find(model_type)<0: | |||
| continue | |||
| if m["fid"] not in res: | |||
| res[m["fid"]] = [] | |||
| res[m["fid"]].append(m) | |||
| return get_json_result(data=res) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @@ -1,391 +1,391 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import json | |||
| import re | |||
| from datetime import datetime | |||
| from flask import request, session, redirect | |||
| from werkzeug.security import generate_password_hash, check_password_hash | |||
| from flask_login import login_required, current_user, login_user, logout_user | |||
| from api.db.db_models import TenantLLM | |||
| from api.db.services.llm_service import TenantLLMService, LLMService | |||
| from api.utils.api_utils import server_error_response, validate_request | |||
| from api.utils import get_uuid, get_format_time, decrypt, download_img, current_timestamp, datetime_format | |||
| from api.db import UserTenantRole, LLMType, FileType | |||
| from api.settings import RetCode, GITHUB_OAUTH, FEISHU_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, \ | |||
| API_KEY, \ | |||
| LLM_FACTORY, LLM_BASE_URL, RERANK_MDL | |||
| from api.db.services.user_service import UserService, TenantService, UserTenantService | |||
| from api.db.services.file_service import FileService | |||
| from api.settings import stat_logger | |||
| from api.utils.api_utils import get_json_result, cors_reponse | |||
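| # /login expects a JSON body with "email" and a "password" that the caller has already | |||
| # encrypted; decrypt() is applied server-side before the credentials are checked. | |||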
| @manager.route('/login', methods=['POST', 'GET']) | |||
| def login(): | |||
| login_channel = "password" | |||
| if not request.json: | |||
| return get_json_result(data=False, retcode=RetCode.AUTHENTICATION_ERROR, | |||
| retmsg='Unauthorized!') | |||
| email = request.json.get('email', "") | |||
| users = UserService.query(email=email) | |||
| if not users: | |||
| return get_json_result( | |||
| data=False, retcode=RetCode.AUTHENTICATION_ERROR, retmsg='This email is not registered!') | |||
| password = request.json.get('password') | |||
| try: | |||
| password = decrypt(password) | |||
| except BaseException: | |||
| return get_json_result( | |||
| data=False, retcode=RetCode.SERVER_ERROR, retmsg='Failed to decrypt the password') | |||
| user = UserService.query_user(email, password) | |||
| if user: | |||
| response_data = user.to_json() | |||
| user.access_token = get_uuid() | |||
| login_user(user) | |||
| user.update_time = current_timestamp() | |||
| user.update_date = datetime_format(datetime.now()) | |||
| user.save() | |||
| msg = "Welcome back!" | |||
| return cors_reponse(data=response_data, auth=user.get_id(), retmsg=msg) | |||
| else: | |||
| return get_json_result(data=False, retcode=RetCode.AUTHENTICATION_ERROR, | |||
| retmsg='Email and Password do not match!') | |||
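Purely for illustration, a hedged client-side sketch of calling the login endpoint above; the host, port, and route prefix are assumptions about the deployment, and the password must already be encrypted in whatever form api.utils.decrypt expects:

```python
import requests

# Assumed values -- adjust to the actual deployment; the route prefix and port
# are not stated in this file.
BASE_URL = "http://localhost:9380/v1/user"
encrypted_password = "<password encrypted client-side, matching api.utils.decrypt>"

resp = requests.post(
    f"{BASE_URL}/login",
    json={"email": "user@example.com", "password": encrypted_password},
)
body = resp.json()
print(body.get("retcode"), body.get("retmsg"))
# On success the handler also returns the auth value (user.get_id()) used by
# subsequent authenticated calls.
```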
| @manager.route('/github_callback', methods=['GET']) | |||
| def github_callback(): | |||
| import requests | |||
| res = requests.post(GITHUB_OAUTH.get("url"), data={ | |||
| "client_id": GITHUB_OAUTH.get("client_id"), | |||
| "client_secret": GITHUB_OAUTH.get("secret_key"), | |||
| "code": request.args.get('code') | |||
| }, headers={"Accept": "application/json"}) | |||
| res = res.json() | |||
| if "error" in res: | |||
| return redirect("/?error=%s" % res["error_description"]) | |||
| if "user:email" not in res["scope"].split(","): | |||
| return redirect("/?error=user:email not in scope") | |||
| session["access_token"] = res["access_token"] | |||
| session["access_token_from"] = "github" | |||
| userinfo = user_info_from_github(session["access_token"]) | |||
| users = UserService.query(email=userinfo["email"]) | |||
| user_id = get_uuid() | |||
| if not users: | |||
| try: | |||
| try: | |||
| avatar = download_img(userinfo["avatar_url"]) | |||
| except Exception as e: | |||
| stat_logger.exception(e) | |||
| avatar = "" | |||
| users = user_register(user_id, { | |||
| "access_token": session["access_token"], | |||
| "email": userinfo["email"], | |||
| "avatar": avatar, | |||
| "nickname": userinfo["login"], | |||
| "login_channel": "github", | |||
| "last_login_time": get_format_time(), | |||
| "is_superuser": False, | |||
| }) | |||
| if not users: | |||
| raise Exception('User registration failed.') | |||
| if len(users) > 1: | |||
| raise Exception('The same e-mail already exists!') | |||
| user = users[0] | |||
| login_user(user) | |||
| return redirect("/?auth=%s" % user.get_id()) | |||
| except Exception as e: | |||
| rollback_user_registration(user_id) | |||
| stat_logger.exception(e) | |||
| return redirect("/?error=%s" % str(e)) | |||
| user = users[0] | |||
| user.access_token = get_uuid() | |||
| login_user(user) | |||
| user.save() | |||
| return redirect("/?auth=%s" % user.get_id()) | |||
| @manager.route('/feishu_callback', methods=['GET']) | |||
| def feishu_callback(): | |||
| import requests | |||
| app_access_token_res = requests.post(FEISHU_OAUTH.get("app_access_token_url"), data=json.dumps({ | |||
| "app_id": FEISHU_OAUTH.get("app_id"), | |||
| "app_secret": FEISHU_OAUTH.get("app_secret") | |||
| }), headers={"Content-Type": "application/json; charset=utf-8"}) | |||
| app_access_token_res = app_access_token_res.json() | |||
| if app_access_token_res['code'] != 0: | |||
| return redirect("/?error=%s" % app_access_token_res) | |||
| res = requests.post(FEISHU_OAUTH.get("user_access_token_url"), data=json.dumps({ | |||
| "grant_type": FEISHU_OAUTH.get("grant_type"), | |||
| "code": request.args.get('code') | |||
| }), headers={"Content-Type": "application/json; charset=utf-8", | |||
| 'Authorization': f"Bearer {app_access_token_res['app_access_token']}"}) | |||
| res = res.json() | |||
| if res['code'] != 0: | |||
| return redirect("/?error=%s" % res["message"]) | |||
| if "contact:user.email:readonly" not in res["data"]["scope"].split(" "): | |||
| return redirect("/?error=contact:user.email:readonly not in scope") | |||
| session["access_token"] = res["data"]["access_token"] | |||
| session["access_token_from"] = "feishu" | |||
| userinfo = user_info_from_feishu(session["access_token"]) | |||
| users = UserService.query(email=userinfo["email"]) | |||
| user_id = get_uuid() | |||
| if not users: | |||
| try: | |||
| try: | |||
| avatar = download_img(userinfo["avatar_url"]) | |||
| except Exception as e: | |||
| stat_logger.exception(e) | |||
| avatar = "" | |||
| users = user_register(user_id, { | |||
| "access_token": session["access_token"], | |||
| "email": userinfo["email"], | |||
| "avatar": avatar, | |||
| "nickname": userinfo["en_name"], | |||
| "login_channel": "feishu", | |||
| "last_login_time": get_format_time(), | |||
| "is_superuser": False, | |||
| }) | |||
| if not users: | |||
| raise Exception('User registration failed.') | |||
| if len(users) > 1: | |||
| raise Exception('The same e-mail already exists!') | |||
| user = users[0] | |||
| login_user(user) | |||
| return redirect("/?auth=%s" % user.get_id()) | |||
| except Exception as e: | |||
| rollback_user_registration(user_id) | |||
| stat_logger.exception(e) | |||
| return redirect("/?error=%s" % str(e)) | |||
| user = users[0] | |||
| user.access_token = get_uuid() | |||
| login_user(user) | |||
| user.save() | |||
| return redirect("/?auth=%s" % user.get_id()) | |||
| def user_info_from_feishu(access_token): | |||
| import requests | |||
| headers = {"Content-Type": "application/json; charset=utf-8", | |||
| 'Authorization': f"Bearer {access_token}"} | |||
| res = requests.get( | |||
| f"https://open.feishu.cn/open-apis/authen/v1/user_info", | |||
| headers=headers) | |||
| user_info = res.json()["data"] | |||
| user_info["email"] = None if user_info.get("email") == "" else user_info["email"] | |||
| return user_info | |||
| def user_info_from_github(access_token): | |||
| import requests | |||
| headers = {"Accept": "application/json", | |||
| 'Authorization': f"token {access_token}"} | |||
| res = requests.get( | |||
| f"https://api.github.com/user?access_token={access_token}", | |||
| headers=headers) | |||
| user_info = res.json() | |||
| email_info = requests.get( | |||
| f"https://api.github.com/user/emails?access_token={access_token}", | |||
| headers=headers).json() | |||
| user_info["email"] = next( | |||
| (email for email in email_info if email['primary'] == True), | |||
| None)["email"] | |||
| return user_info | |||
| @manager.route("/logout", methods=['GET']) | |||
| @login_required | |||
| def log_out(): | |||
| current_user.access_token = "" | |||
| current_user.save() | |||
| logout_user() | |||
| return get_json_result(data=True) | |||
| @manager.route("/setting", methods=["POST"]) | |||
| @login_required | |||
| def setting_user(): | |||
| update_dict = {} | |||
| request_data = request.json | |||
| if request_data.get("password"): | |||
| new_password = request_data.get("new_password") | |||
| if not check_password_hash( | |||
| current_user.password, decrypt(request_data["password"])): | |||
| return get_json_result( | |||
| data=False, retcode=RetCode.AUTHENTICATION_ERROR, retmsg='Password error!') | |||
| if new_password: | |||
| update_dict["password"] = generate_password_hash( | |||
| decrypt(new_password)) | |||
| for k in request_data.keys(): | |||
| if k in ["password", "new_password"]: | |||
| continue | |||
| update_dict[k] = request_data[k] | |||
| try: | |||
| UserService.update_by_id(current_user.id, update_dict) | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| stat_logger.exception(e) | |||
| return get_json_result( | |||
| data=False, retmsg='Update failure!', retcode=RetCode.EXCEPTION_ERROR) | |||
| @manager.route("/info", methods=["GET"]) | |||
| @login_required | |||
| def user_info(): | |||
| return get_json_result(data=current_user.to_dict()) | |||
| def rollback_user_registration(user_id): | |||
| try: | |||
| UserService.delete_by_id(user_id) | |||
| except Exception as e: | |||
| pass | |||
| try: | |||
| TenantService.delete_by_id(user_id) | |||
| except Exception as e: | |||
| pass | |||
| try: | |||
| u = UserTenantService.query(tenant_id=user_id) | |||
| if u: | |||
| UserTenantService.delete_by_id(u[0].id) | |||
| except Exception as e: | |||
| pass | |||
| try: | |||
| TenantLLM.delete().where(TenantLLM.tenant_id == user_id).execute() | |||
| except Exception as e: | |||
| pass | |||
| def user_register(user_id, user): | |||
| user["id"] = user_id | |||
| tenant = { | |||
| "id": user_id, | |||
| "name": user["nickname"] + "'s Kingdom", | |||
| "llm_id": CHAT_MDL, | |||
| "embd_id": EMBEDDING_MDL, | |||
| "asr_id": ASR_MDL, | |||
| "parser_ids": PARSERS, | |||
| "img2txt_id": IMAGE2TEXT_MDL, | |||
| "rerank_id": RERANK_MDL | |||
| } | |||
| usr_tenant = { | |||
| "tenant_id": user_id, | |||
| "user_id": user_id, | |||
| "invited_by": user_id, | |||
| "role": UserTenantRole.OWNER | |||
| } | |||
| file_id = get_uuid() | |||
| file = { | |||
| "id": file_id, | |||
| "parent_id": file_id, | |||
| "tenant_id": user_id, | |||
| "created_by": user_id, | |||
| "name": "/", | |||
| "type": FileType.FOLDER.value, | |||
| "size": 0, | |||
| "location": "", | |||
| } | |||
| tenant_llm = [] | |||
| for llm in LLMService.query(fid=LLM_FACTORY): | |||
| tenant_llm.append({"tenant_id": user_id, | |||
| "llm_factory": LLM_FACTORY, | |||
| "llm_name": llm.llm_name, | |||
| "model_type": llm.model_type, | |||
| "api_key": API_KEY, | |||
| "api_base": LLM_BASE_URL | |||
| }) | |||
| if not UserService.save(**user): | |||
| return | |||
| TenantService.insert(**tenant) | |||
| UserTenantService.insert(**usr_tenant) | |||
| TenantLLMService.insert_many(tenant_llm) | |||
| FileService.insert(file) | |||
| return UserService.query(email=user["email"]) | |||
| @manager.route("/register", methods=["POST"]) | |||
| @validate_request("nickname", "email", "password") | |||
| def user_add(): | |||
| req = request.json | |||
| if UserService.query(email=req["email"]): | |||
| return get_json_result( | |||
| data=False, retmsg=f'Email: {req["email"]} has already been registered!', retcode=RetCode.OPERATING_ERROR) | |||
| if not re.match(r"^[\w\._-]+@([\w_-]+\.)+[\w-]{2,4}$", req["email"]): | |||
| return get_json_result(data=False, retmsg=f'Invalid e-mail: {req["email"]}!', | |||
| retcode=RetCode.OPERATING_ERROR) | |||
| user_dict = { | |||
| "access_token": get_uuid(), | |||
| "email": req["email"], | |||
| "nickname": req["nickname"], | |||
| "password": decrypt(req["password"]), | |||
| "login_channel": "password", | |||
| "last_login_time": get_format_time(), | |||
| "is_superuser": False, | |||
| } | |||
| user_id = get_uuid() | |||
| try: | |||
| users = user_register(user_id, user_dict) | |||
| if not users: | |||
| raise Exception('User registration failed.') | |||
| if len(users) > 1: | |||
| raise Exception('The same e-mail already exists!') | |||
| user = users[0] | |||
| login_user(user) | |||
| return cors_reponse(data=user.to_json(), | |||
| auth=user.get_id(), retmsg="Welcome aboard!") | |||
| except Exception as e: | |||
| rollback_user_registration(user_id) | |||
| stat_logger.exception(e) | |||
| return get_json_result( | |||
| data=False, retmsg='User registration failure!', retcode=RetCode.EXCEPTION_ERROR) | |||
| @manager.route("/tenant_info", methods=["GET"]) | |||
| @login_required | |||
| def tenant_info(): | |||
| try: | |||
| tenants = TenantService.get_by_user_id(current_user.id)[0] | |||
| return get_json_result(data=tenants) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
| @manager.route("/set_tenant_info", methods=["POST"]) | |||
| @login_required | |||
| @validate_request("tenant_id", "asr_id", "embd_id", "img2txt_id", "llm_id") | |||
| def set_tenant_info(): | |||
| req = request.json | |||
| try: | |||
| tid = req["tenant_id"] | |||
| del req["tenant_id"] | |||
| TenantService.update_by_id(tid, req) | |||
| return get_json_result(data=True) | |||
| except Exception as e: | |||
| return server_error_response(e) | |||
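As a hedged illustration of the tenant-settings endpoint above (the base URL and the Authorization header name are assumptions about the deployment; the model ids are placeholders, not values defined in this file):

```python
import requests

# Hypothetical deployment values.
BASE_URL = "http://localhost:9380/v1/user"
AUTH_TOKEN = "<auth value returned by /login or /register>"

payload = {
    "tenant_id": "<tenant id reported by /tenant_info>",
    "llm_id": "<chat model id>",
    "embd_id": "<embedding model id>",
    "asr_id": "<speech-to-text model id>",
    "img2txt_id": "<image-to-text model id>",
}
resp = requests.post(
    f"{BASE_URL}/set_tenant_info",
    json=payload,
    headers={"Authorization": AUTH_TOKEN},
)
print(resp.json())  # expected to report data=true with a success retcode
```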
| @@ -1,102 +1,102 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from enum import Enum | |||
| from enum import IntEnum | |||
| from strenum import StrEnum | |||
| class StatusEnum(Enum): | |||
| VALID = "1" | |||
| INVALID = "0" | |||
| class UserTenantRole(StrEnum): | |||
| OWNER = 'owner' | |||
| ADMIN = 'admin' | |||
| NORMAL = 'normal' | |||
| class TenantPermission(StrEnum): | |||
| ME = 'me' | |||
| TEAM = 'team' | |||
| class SerializedType(IntEnum): | |||
| PICKLE = 1 | |||
| JSON = 2 | |||
| class FileType(StrEnum): | |||
| PDF = 'pdf' | |||
| DOC = 'doc' | |||
| VISUAL = 'visual' | |||
| AURAL = 'aural' | |||
| VIRTUAL = 'virtual' | |||
| FOLDER = 'folder' | |||
| OTHER = "other" | |||
| class LLMType(StrEnum): | |||
| CHAT = 'chat' | |||
| EMBEDDING = 'embedding' | |||
| SPEECH2TEXT = 'speech2text' | |||
| IMAGE2TEXT = 'image2text' | |||
| RERANK = 'rerank' | |||
| class ChatStyle(StrEnum): | |||
| CREATIVE = 'Creative' | |||
| PRECISE = 'Precise' | |||
| EVENLY = 'Evenly' | |||
| CUSTOM = 'Custom' | |||
| class TaskStatus(StrEnum): | |||
| UNSTART = "0" | |||
| RUNNING = "1" | |||
| CANCEL = "2" | |||
| DONE = "3" | |||
| FAIL = "4" | |||
| class ParserType(StrEnum): | |||
| PRESENTATION = "presentation" | |||
| LAWS = "laws" | |||
| MANUAL = "manual" | |||
| PAPER = "paper" | |||
| RESUME = "resume" | |||
| BOOK = "book" | |||
| QA = "qa" | |||
| TABLE = "table" | |||
| NAIVE = "naive" | |||
| PICTURE = "picture" | |||
| ONE = "one" | |||
| AUDIO = "audio" | |||
| EMAIL = "email" | |||
| KG = "knowledge_graph" | |||
| class FileSource(StrEnum): | |||
| LOCAL = "" | |||
| KNOWLEDGEBASE = "knowledgebase" | |||
| S3 = "s3" | |||
| class CanvasType(StrEnum): | |||
| ChatBot = "chatbot" | |||
| DocBot = "docbot" | |||
| KNOWLEDGEBASE_FOLDER_NAME = ".knowledgebase" | |||
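Since the classes above are StrEnums (members subclass str), a quick sketch of how the rest of the codebase relies on that; the import path follows the `from api.db import ...` lines seen earlier in this diff:

```python
from api.db import FileType, ParserType, TaskStatus  # enums defined above

# StrEnum members compare equal to their string values, so they can be passed
# straight into DB filters and JSON payloads without calling .value.
assert FileType.FOLDER == "folder"
assert ParserType.KG.value == "knowledge_graph"
assert TaskStatus.RUNNING == "1"
print(FileType.PDF.value, ParserType.NAIVE.value)  # pdf naive
```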
| @@ -1,130 +1,130 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import operator | |||
| from functools import reduce | |||
| from typing import Dict, Type, Union | |||
| from api.utils import current_timestamp, timestamp_to_date | |||
| from api.db.db_models import DB, DataBaseModel | |||
| from api.db.runtime_config import RuntimeConfig | |||
| from api.utils.log_utils import getLogger | |||
| from enum import Enum | |||
| LOGGER = getLogger() | |||
| @DB.connection_context() | |||
| def bulk_insert_into_db(model, data_source, replace_on_conflict=False): | |||
| DB.create_tables([model]) | |||
| for i, data in enumerate(data_source): | |||
| current_time = current_timestamp() + i | |||
| current_date = timestamp_to_date(current_time) | |||
| if 'create_time' not in data: | |||
| data['create_time'] = current_time | |||
| data['create_date'] = timestamp_to_date(data['create_time']) | |||
| data['update_time'] = current_time | |||
| data['update_date'] = current_date | |||
| preserve = tuple(data_source[0].keys() - {'create_time', 'create_date'}) | |||
| batch_size = 1000 | |||
| for i in range(0, len(data_source), batch_size): | |||
| with DB.atomic(): | |||
| query = model.insert_many(data_source[i:i + batch_size]) | |||
| if replace_on_conflict: | |||
| query = query.on_conflict(preserve=preserve) | |||
| query.execute() | |||
| def get_dynamic_db_model(base, job_id): | |||
| return type(base.model( | |||
| table_index=get_dynamic_tracking_table_index(job_id=job_id))) | |||
| def get_dynamic_tracking_table_index(job_id): | |||
| return job_id[:8] | |||
| def fill_db_model_object(model_object, human_model_dict): | |||
| for k, v in human_model_dict.items(): | |||
| attr_name = 'f_%s' % k | |||
| if hasattr(model_object.__class__, attr_name): | |||
| setattr(model_object, attr_name, v) | |||
| return model_object | |||
| # https://docs.peewee-orm.com/en/latest/peewee/query_operators.html | |||
| supported_operators = { | |||
| '==': operator.eq, | |||
| '<': operator.lt, | |||
| '<=': operator.le, | |||
| '>': operator.gt, | |||
| '>=': operator.ge, | |||
| '!=': operator.ne, | |||
| '<<': operator.lshift, | |||
| '>>': operator.rshift, | |||
| '%': operator.mod, | |||
| '**': operator.pow, | |||
| '^': operator.xor, | |||
| '~': operator.inv, | |||
| } | |||
| def query_dict2expression( | |||
| model: Type[DataBaseModel], query: Dict[str, Union[bool, int, str, list, tuple]]): | |||
| expression = [] | |||
| for field, value in query.items(): | |||
| if not isinstance(value, (list, tuple)): | |||
| value = ('==', value) | |||
| op, *val = value | |||
| field = getattr(model, f'f_{field}') | |||
| value = supported_operators[op]( | |||
| field, val[0]) if op in supported_operators else getattr( | |||
| field, op)( | |||
| *val) | |||
| expression.append(value) | |||
| return reduce(operator.iand, expression) | |||
| def query_db(model: Type[DataBaseModel], limit: int = 0, offset: int = 0, | |||
| query: dict = None, order_by: Union[str, list, tuple] = None): | |||
| data = model.select() | |||
| if query: | |||
| data = data.where(query_dict2expression(model, query)) | |||
| count = data.count() | |||
| if not order_by: | |||
| order_by = 'create_time' | |||
| if not isinstance(order_by, (list, tuple)): | |||
| order_by = (order_by, 'asc') | |||
| order_by, order = order_by | |||
| order_by = getattr(model, f'f_{order_by}') | |||
| order_by = getattr(order_by, order)() | |||
| data = data.order_by(order_by) | |||
| if limit > 0: | |||
| data = data.limit(limit) | |||
| if offset > 0: | |||
| data = data.offset(offset) | |||
| return list(data), count | |||
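To make the query-dict convention used by query_dict2expression/query_db concrete, here is a small standalone sketch that applies the same operator table to plain dictionaries instead of peewee fields (the helper and sample data are illustrative, not part of the codebase):

```python
import operator

# A bare value means equality; an (op, value) tuple selects an operator,
# mirroring query_dict2expression above.
SUPPORTED = {"==": operator.eq, "<": operator.lt, "<=": operator.le,
             ">": operator.gt, ">=": operator.ge, "!=": operator.ne}

def matches(row: dict, query: dict) -> bool:
    for field, cond in query.items():
        if not isinstance(cond, (list, tuple)):
            cond = ("==", cond)
        op, *val = cond
        if not SUPPORTED[op](row[field], val[0]):
            return False
    return True

row = {"status": "1", "chunk_num": 3}
print(matches(row, {"status": "1", "chunk_num": (">", 0)}))  # True
print(matches(row, {"chunk_num": (">=", 10)}))               # False
```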
| @@ -1,184 +1,184 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import json | |||
| import os | |||
| import time | |||
| import uuid | |||
| from copy import deepcopy | |||
| from api.db import LLMType, UserTenantRole | |||
| from api.db.db_models import init_database_tables as init_web_db, LLMFactories, LLM, TenantLLM | |||
| from api.db.services import UserService | |||
| from api.db.services.canvas_service import CanvasTemplateService | |||
| from api.db.services.document_service import DocumentService | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from api.db.services.llm_service import LLMFactoriesService, LLMService, TenantLLMService, LLMBundle | |||
| from api.db.services.user_service import TenantService, UserTenantService | |||
| from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY, LLM_BASE_URL | |||
| from api.utils.file_utils import get_project_base_directory | |||
| def init_superuser(): | |||
| user_info = { | |||
| "id": uuid.uuid1().hex, | |||
| "password": "admin", | |||
| "nickname": "admin", | |||
| "is_superuser": True, | |||
| "email": "admin@ragflow.io", | |||
| "creator": "system", | |||
| "status": "1", | |||
| } | |||
| tenant = { | |||
| "id": user_info["id"], | |||
| "name": user_info["nickname"] + "'s Kingdom", | |||
| "llm_id": CHAT_MDL, | |||
| "embd_id": EMBEDDING_MDL, | |||
| "asr_id": ASR_MDL, | |||
| "parser_ids": PARSERS, | |||
| "img2txt_id": IMAGE2TEXT_MDL | |||
| } | |||
| usr_tenant = { | |||
| "tenant_id": user_info["id"], | |||
| "user_id": user_info["id"], | |||
| "invited_by": user_info["id"], | |||
| "role": UserTenantRole.OWNER | |||
| } | |||
| tenant_llm = [] | |||
| for llm in LLMService.query(fid=LLM_FACTORY): | |||
| tenant_llm.append( | |||
| {"tenant_id": user_info["id"], "llm_factory": LLM_FACTORY, "llm_name": llm.llm_name, "model_type": llm.model_type, | |||
| "api_key": API_KEY, "api_base": LLM_BASE_URL}) | |||
| if not UserService.save(**user_info): | |||
| print("\033[93m【ERROR】\033[0mcan't init admin.") | |||
| return | |||
| TenantService.insert(**tenant) | |||
| UserTenantService.insert(**usr_tenant) | |||
| TenantLLMService.insert_many(tenant_llm) | |||
| print( | |||
| "【INFO】Super user initialized. \033[93memail: admin@ragflow.io, password: admin\033[0m. Changing the password after logging in is strongly recommended.") | |||
| chat_mdl = LLMBundle(tenant["id"], LLMType.CHAT, tenant["llm_id"]) | |||
| msg = chat_mdl.chat(system="", history=[ | |||
| {"role": "user", "content": "Hello!"}], gen_conf={}) | |||
| if msg.find("ERROR: ") == 0: | |||
| print( | |||
| "\33[91m【ERROR】\33[0m: ", | |||
| "'{}' doesn't work. {}".format( | |||
| tenant["llm_id"], | |||
| msg)) | |||
| embd_mdl = LLMBundle(tenant["id"], LLMType.EMBEDDING, tenant["embd_id"]) | |||
| v, c = embd_mdl.encode(["Hello!"]) | |||
| if c == 0: | |||
| print( | |||
| "\33[91m【ERROR】\33[0m:", | |||
| " '{}' doesn't work!".format( | |||
| tenant["embd_id"])) | |||
| def init_llm_factory(): | |||
| try: | |||
| LLMService.filter_delete([(LLM.fid == "MiniMax") | (LLM.fid == "Minimax")]) | |||
| except Exception as e: | |||
| pass | |||
| factory_llm_infos = json.load( | |||
| open( | |||
| os.path.join(get_project_base_directory(), "conf", "llm_factories.json"), | |||
| "r", | |||
| ) | |||
| ) | |||
| for factory_llm_info in factory_llm_infos["factory_llm_infos"]: | |||
| llm_infos = factory_llm_info.pop("llm") | |||
| try: | |||
| LLMFactoriesService.save(**factory_llm_info) | |||
| except Exception as e: | |||
| pass | |||
| LLMService.filter_delete([LLM.fid == factory_llm_info["name"]]) | |||
| for llm_info in llm_infos: | |||
| llm_info["fid"] = factory_llm_info["name"] | |||
| try: | |||
| LLMService.save(**llm_info) | |||
| except Exception as e: | |||
| pass | |||
| LLMFactoriesService.filter_delete([LLMFactories.name == "Local"]) | |||
| LLMService.filter_delete([LLM.fid == "Local"]) | |||
| LLMService.filter_delete([LLM.llm_name == "qwen-vl-max"]) | |||
| LLMService.filter_delete([LLM.fid == "Moonshot", LLM.llm_name == "flag-embedding"]) | |||
| TenantLLMService.filter_delete([TenantLLM.llm_factory == "Moonshot", TenantLLM.llm_name == "flag-embedding"]) | |||
| LLMFactoriesService.filter_delete([LLMFactoriesService.model.name == "QAnything"]) | |||
| LLMService.filter_delete([LLMService.model.fid == "QAnything"]) | |||
| TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"}) | |||
| TenantService.filter_update([1 == 1], { | |||
| "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email"}) | |||
| ## Insert the two OpenAI embedding models for every tenant that has already configured OpenAI. | |||
| print("Start to insert 2 OpenAI embedding models...") | |||
| tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()]) | |||
| for tid in tenant_ids: | |||
| for row in TenantLLMService.query(llm_factory="OpenAI", tenant_id=tid): | |||
| row = row.to_dict() | |||
| row["model_type"] = LLMType.EMBEDDING.value | |||
| row["llm_name"] = "text-embedding-3-small" | |||
| row["used_tokens"] = 0 | |||
| try: | |||
| TenantLLMService.save(**row) | |||
| row = deepcopy(row) | |||
| row["llm_name"] = "text-embedding-3-large" | |||
| TenantLLMService.save(**row) | |||
| except Exception as e: | |||
| pass | |||
| break | |||
| for kb_id in KnowledgebaseService.get_all_ids(): | |||
| KnowledgebaseService.update_by_id(kb_id, {"doc_num": DocumentService.get_kb_doc_count(kb_id)}) | |||
| """ | |||
| drop table llm; | |||
| drop table llm_factories; | |||
| update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph'; | |||
| alter table knowledgebase modify avatar longtext; | |||
| alter table user modify avatar longtext; | |||
| alter table dialog modify icon longtext; | |||
| """ | |||
| def add_graph_templates(): | |||
| template_dir = os.path.join(get_project_base_directory(), "agent", "templates") | |||
| for fnm in os.listdir(template_dir): | |||
| try: | |||
| cnvs = json.load(open(os.path.join(template_dir, fnm), "r")) | |||
| try: | |||
| CanvasTemplateService.save(**cnvs) | |||
| except Exception: | |||
| CanvasTemplateService.update_by_id(cnvs["id"], cnvs) | |||
| except Exception as e: | |||
| print("Add graph templates error: ", e) | |||
| print("------------", flush=True) | |||
| def init_web_data(): | |||
| start_time = time.time() | |||
| init_llm_factory() | |||
| if not UserService.get_all().count(): | |||
| init_superuser() | |||
| add_graph_templates() | |||
| print("init web data success:{}".format(time.time() - start_time)) | |||
| if __name__ == '__main__': | |||
| init_web_db() | |||
| init_web_data() | |||
| @@ -1,21 +1,21 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import operator | |||
| import time | |||
| import typing | |||
| from api.utils.log_utils import sql_logger | |||
| import peewee | |||
| @@ -1,28 +1,28 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| class ReloadConfigBase: | |||
| @classmethod | |||
| def get_all(cls): | |||
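| # Collect every plain (non-callable, non-underscore) class attribute into a dict. | |||
| # Illustrative example with the RuntimeConfig subclass defined elsewhere in this PR: | |||
| # RuntimeConfig.get_all() -> {"DEBUG": None, "HTTP_PORT": None, ...} before init_config() is called. | |||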
| configs = {} | |||
| for k, v in cls.__dict__.items(): | |||
| if not callable(getattr(cls, k)) and not k.startswith( | |||
| "__") and not k.startswith("_"): | |||
| configs[k] = v | |||
| return configs | |||
| @classmethod | |||
| def get(cls, config_name): | |||
| return getattr(cls, config_name) if hasattr(cls, config_name) else None | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| class ReloadConfigBase: | |||
| @classmethod | |||
| def get_all(cls): | |||
| configs = {} | |||
| for k, v in cls.__dict__.items(): | |||
| if not callable(getattr(cls, k)) and not k.startswith( | |||
| "__") and not k.startswith("_"): | |||
| configs[k] = v | |||
| return configs | |||
| @classmethod | |||
| def get(cls, config_name): | |||
| return getattr(cls, config_name) if hasattr(cls, config_name) else None | |||
| @@ -1,54 +1,54 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from api.versions import get_versions | |||
| from .reload_config_base import ReloadConfigBase | |||
| class RuntimeConfig(ReloadConfigBase): | |||
| DEBUG = None | |||
| WORK_MODE = None | |||
| HTTP_PORT = None | |||
| JOB_SERVER_HOST = None | |||
| JOB_SERVER_VIP = None | |||
| ENV = dict() | |||
| SERVICE_DB = None | |||
| LOAD_CONFIG_MANAGER = False | |||
| @classmethod | |||
| def init_config(cls, **kwargs): | |||
| for k, v in kwargs.items(): | |||
| if hasattr(cls, k): | |||
| setattr(cls, k, v) | |||
| @classmethod | |||
| def init_env(cls): | |||
| cls.ENV.update(get_versions()) | |||
| @classmethod | |||
| def load_config_manager(cls): | |||
| cls.LOAD_CONFIG_MANAGER = True | |||
| @classmethod | |||
| def get_env(cls, key): | |||
| return cls.ENV.get(key, None) | |||
| @classmethod | |||
| def get_all_env(cls): | |||
| return cls.ENV | |||
| @classmethod | |||
| def set_service_db(cls, service_db): | |||
| cls.SERVICE_DB = service_db | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from api.versions import get_versions | |||
| from .reload_config_base import ReloadConfigBase | |||
| class RuntimeConfig(ReloadConfigBase): | |||
| DEBUG = None | |||
| WORK_MODE = None | |||
| HTTP_PORT = None | |||
| JOB_SERVER_HOST = None | |||
| JOB_SERVER_VIP = None | |||
| ENV = dict() | |||
| SERVICE_DB = None | |||
| LOAD_CONFIG_MANAGER = False | |||
| @classmethod | |||
| def init_config(cls, **kwargs): | |||
| for k, v in kwargs.items(): | |||
| if hasattr(cls, k): | |||
| setattr(cls, k, v) | |||
| @classmethod | |||
| def init_env(cls): | |||
| cls.ENV.update(get_versions()) | |||
| @classmethod | |||
| def load_config_manager(cls): | |||
| cls.LOAD_CONFIG_MANAGER = True | |||
| @classmethod | |||
| def get_env(cls, key): | |||
| return cls.ENV.get(key, None) | |||
| @classmethod | |||
| def get_all_env(cls): | |||
| return cls.ENV | |||
| @classmethod | |||
| def set_service_db(cls, service_db): | |||
| cls.SERVICE_DB = service_db | |||
| @@ -1,38 +1,38 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import pathlib | |||
| import re | |||
| from .user_service import UserService | |||
| def duplicate_name(query_func, **kwargs): | |||
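| # Generate a non-conflicting name by appending/incrementing a "(n)" suffix until query_func | |||
| # finds no existing record, e.g. "report.pdf" -> "report(1).pdf" -> "report(2).pdf" (illustrative). | |||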
| fnm = kwargs["name"] | |||
| objs = query_func(**kwargs) | |||
| if not objs: return fnm | |||
| ext = pathlib.Path(fnm).suffix  # e.g. ".jpg" | |||
| nm = re.sub(r"%s$" % re.escape(ext), "", fnm) | |||
| r = re.search(r"\(([0-9]+)\)$", nm) | |||
| c = 0 | |||
| if r: | |||
| c = int(r.group(1)) | |||
| nm = re.sub(r"\([0-9]+\)$", "", nm) | |||
| c += 1 | |||
| nm = f"{nm}({c})" | |||
| if ext: nm += f"{ext}" | |||
| kwargs["name"] = nm | |||
| return duplicate_name(query_func, **kwargs) | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import pathlib | |||
| import re | |||
| from .user_service import UserService | |||
| def duplicate_name(query_func, **kwargs): | |||
| fnm = kwargs["name"] | |||
| objs = query_func(**kwargs) | |||
| if not objs: return fnm | |||
| ext = pathlib.Path(fnm).suffix  # e.g. ".jpg" | |||
| nm = re.sub(r"%s$" % re.escape(ext), "", fnm) | |||
| r = re.search(r"\(([0-9]+)\)$", nm) | |||
| c = 0 | |||
| if r: | |||
| c = int(r.group(1)) | |||
| nm = re.sub(r"\([0-9]+\)$", "", nm) | |||
| c += 1 | |||
| nm = f"{nm}({c})" | |||
| if ext: nm += f"{ext}" | |||
| kwargs["name"] = nm | |||
| return duplicate_name(query_func, **kwargs) | |||
| @@ -1,68 +1,68 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from datetime import datetime | |||
| import peewee | |||
| from api.db.db_models import DB, API4Conversation, APIToken, Dialog | |||
| from api.db.services.common_service import CommonService | |||
| from api.utils import current_timestamp, datetime_format | |||
| class APITokenService(CommonService): | |||
| model = APIToken | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def used(cls, token): | |||
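| # Touch the token's update_time/update_date; returns the number of rows updated. | |||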
| return cls.model.update({ | |||
| "update_time": current_timestamp(), | |||
| "update_date": datetime_format(datetime.now()), | |||
| }).where( | |||
| cls.model.token == token | |||
| ).execute() | |||
| class API4ConversationService(CommonService): | |||
| model = API4Conversation | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def append_message(cls, id, conversation): | |||
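| # Persist the updated conversation, then increment its round counter with a separate UPDATE. | |||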
| cls.update_by_id(id, conversation) | |||
| return cls.model.update(round=cls.model.round + 1).where(cls.model.id==id).execute() | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def stats(cls, tenant_id, from_date, to_date, source=None): | |||
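| # Per-day aggregates (pv, uv, tokens, duration, average round, thumb_up) of a tenant's API | |||
| # conversations between from_date and to_date, filtered by source. | |||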
| if len(to_date) == 10: to_date += " 23:59:59" | |||
| return cls.model.select( | |||
| cls.model.create_date.truncate("day").alias("dt"), | |||
| peewee.fn.COUNT( | |||
| cls.model.id).alias("pv"), | |||
| peewee.fn.COUNT( | |||
| cls.model.user_id.distinct()).alias("uv"), | |||
| peewee.fn.SUM( | |||
| cls.model.tokens).alias("tokens"), | |||
| peewee.fn.SUM( | |||
| cls.model.duration).alias("duration"), | |||
| peewee.fn.AVG( | |||
| cls.model.round).alias("round"), | |||
| peewee.fn.SUM( | |||
| cls.model.thumb_up).alias("thumb_up") | |||
| ).join(Dialog, on=(cls.model.dialog_id == Dialog.id & Dialog.tenant_id == tenant_id)).where( | |||
| cls.model.create_date >= from_date, | |||
| cls.model.create_date <= to_date, | |||
| cls.model.source == source | |||
| ).group_by(cls.model.create_date.truncate("day")).dicts() | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from datetime import datetime | |||
| import peewee | |||
| from api.db.db_models import DB, API4Conversation, APIToken, Dialog | |||
| from api.db.services.common_service import CommonService | |||
| from api.utils import current_timestamp, datetime_format | |||
| class APITokenService(CommonService): | |||
| model = APIToken | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def used(cls, token): | |||
| return cls.model.update({ | |||
| "update_time": current_timestamp(), | |||
| "update_date": datetime_format(datetime.now()), | |||
| }).where( | |||
| cls.model.token == token | |||
| ).execute() | |||
| class API4ConversationService(CommonService): | |||
| model = API4Conversation | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def append_message(cls, id, conversation): | |||
| cls.update_by_id(id, conversation) | |||
| return cls.model.update(round=cls.model.round + 1).where(cls.model.id==id).execute() | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def stats(cls, tenant_id, from_date, to_date, source=None): | |||
| if len(to_date) == 10: to_date += " 23:59:59" | |||
| return cls.model.select( | |||
| cls.model.create_date.truncate("day").alias("dt"), | |||
| peewee.fn.COUNT( | |||
| cls.model.id).alias("pv"), | |||
| peewee.fn.COUNT( | |||
| cls.model.user_id.distinct()).alias("uv"), | |||
| peewee.fn.SUM( | |||
| cls.model.tokens).alias("tokens"), | |||
| peewee.fn.SUM( | |||
| cls.model.duration).alias("duration"), | |||
| peewee.fn.AVG( | |||
| cls.model.round).alias("round"), | |||
| peewee.fn.SUM( | |||
| cls.model.thumb_up).alias("thumb_up") | |||
| ).join(Dialog, on=(cls.model.dialog_id == Dialog.id & Dialog.tenant_id == tenant_id)).where( | |||
| cls.model.create_date >= from_date, | |||
| cls.model.create_date <= to_date, | |||
| cls.model.source == source | |||
| ).group_by(cls.model.create_date.truncate("day")).dicts() | |||
| @@ -1,183 +1,183 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from datetime import datetime | |||
| import peewee | |||
| from api.db.db_models import DB | |||
| from api.utils import datetime_format, current_timestamp, get_uuid | |||
| class CommonService: | |||
| model = None | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def query(cls, cols=None, reverse=None, order_by=None, **kwargs): | |||
| return cls.model.query(cols=cols, reverse=reverse, | |||
| order_by=order_by, **kwargs) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_all(cls, cols=None, reverse=None, order_by=None): | |||
| if cols: | |||
| query_records = cls.model.select(*cols) | |||
| else: | |||
| query_records = cls.model.select() | |||
| if reverse is not None: | |||
| if not order_by or not hasattr(cls, order_by): | |||
| order_by = "create_time" | |||
| if reverse is True: | |||
| query_records = query_records.order_by( | |||
| cls.model.getter_by(order_by).desc()) | |||
| elif reverse is False: | |||
| query_records = query_records.order_by( | |||
| cls.model.getter_by(order_by).asc()) | |||
| return query_records | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get(cls, **kwargs): | |||
| return cls.model.get(**kwargs) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_or_none(cls, **kwargs): | |||
| try: | |||
| return cls.model.get(**kwargs) | |||
| except peewee.DoesNotExist: | |||
| return None | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def save(cls, **kwargs): | |||
| # if "id" not in kwargs: | |||
| # kwargs["id"] = get_uuid() | |||
| sample_obj = cls.model(**kwargs).save(force_insert=True) | |||
| return sample_obj | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def insert(cls, **kwargs): | |||
| if "id" not in kwargs: | |||
| kwargs["id"] = get_uuid() | |||
| kwargs["create_time"] = current_timestamp() | |||
| kwargs["create_date"] = datetime_format(datetime.now()) | |||
| kwargs["update_time"] = current_timestamp() | |||
| kwargs["update_date"] = datetime_format(datetime.now()) | |||
| sample_obj = cls.model(**kwargs).save(force_insert=True) | |||
| return sample_obj | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def insert_many(cls, data_list, batch_size=100): | |||
| with DB.atomic(): | |||
| for d in data_list: | |||
| d["create_time"] = current_timestamp() | |||
| d["create_date"] = datetime_format(datetime.now()) | |||
| for i in range(0, len(data_list), batch_size): | |||
| cls.model.insert_many(data_list[i:i + batch_size]).execute() | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def update_many_by_id(cls, data_list): | |||
| with DB.atomic(): | |||
| for data in data_list: | |||
| data["update_time"] = current_timestamp() | |||
| data["update_date"] = datetime_format(datetime.now()) | |||
| cls.model.update(data).where( | |||
| cls.model.id == data["id"]).execute() | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def update_by_id(cls, pid, data): | |||
| data["update_time"] = current_timestamp() | |||
| data["update_date"] = datetime_format(datetime.now()) | |||
| num = cls.model.update(data).where(cls.model.id == pid).execute() | |||
| return num | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_by_id(cls, pid): | |||
| try: | |||
| obj = cls.model.query(id=pid)[0] | |||
| return True, obj | |||
| except Exception as e: | |||
| return False, None | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_by_ids(cls, pids, cols=None): | |||
| if cols: | |||
| objs = cls.model.select(*cols) | |||
| else: | |||
| objs = cls.model.select() | |||
| return objs.where(cls.model.id.in_(pids)) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def delete_by_id(cls, pid): | |||
| return cls.model.delete().where(cls.model.id == pid).execute() | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def filter_delete(cls, filters): | |||
| with DB.atomic(): | |||
| num = cls.model.delete().where(*filters).execute() | |||
| return num | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def filter_update(cls, filters, update_data): | |||
| with DB.atomic(): | |||
| return cls.model.update(update_data).where(*filters).execute() | |||
| @staticmethod | |||
| def cut_list(tar_list, n): | |||
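| # Split tar_list into consecutive tuples of at most n items, | |||
| # e.g. cut_list([1, 2, 3, 4, 5], 2) -> [(1, 2), (3, 4), (5,)]. | |||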
| length = len(tar_list) | |||
| arr = range(length) | |||
| result = [tuple(tar_list[x:(x + n)]) for x in arr[::n]] | |||
| return result | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def filter_scope_list(cls, in_key, in_filters_list, | |||
| filters=None, cols=None): | |||
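| # Run the IN-filter in batches of 20 values (via cut_list) and merge the matching rows, | |||
| # keeping each generated SQL "IN (...)" clause short. | |||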
| in_filters_tuple_list = cls.cut_list(in_filters_list, 20) | |||
| if not filters: | |||
| filters = [] | |||
| res_list = [] | |||
| if cols: | |||
| for i in in_filters_tuple_list: | |||
| query_records = cls.model.select(*cols).where( | |||
| getattr(cls.model, in_key).in_(i), *filters) | |||
| if query_records: | |||
| res_list.extend( | |||
| [query_record for query_record in query_records]) | |||
| else: | |||
| for i in in_filters_tuple_list: | |||
| query_records = cls.model.select().where( | |||
| getattr(cls.model, in_key).in_(i), *filters) | |||
| if query_records: | |||
| res_list.extend( | |||
| [query_record for query_record in query_records]) | |||
| return res_list | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from datetime import datetime | |||
| import peewee | |||
| from api.db.db_models import DB | |||
| from api.utils import datetime_format, current_timestamp, get_uuid | |||
| class CommonService: | |||
| model = None | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def query(cls, cols=None, reverse=None, order_by=None, **kwargs): | |||
| return cls.model.query(cols=cols, reverse=reverse, | |||
| order_by=order_by, **kwargs) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_all(cls, cols=None, reverse=None, order_by=None): | |||
| if cols: | |||
| query_records = cls.model.select(*cols) | |||
| else: | |||
| query_records = cls.model.select() | |||
| if reverse is not None: | |||
| if not order_by or not hasattr(cls, order_by): | |||
| order_by = "create_time" | |||
| if reverse is True: | |||
| query_records = query_records.order_by( | |||
| cls.model.getter_by(order_by).desc()) | |||
| elif reverse is False: | |||
| query_records = query_records.order_by( | |||
| cls.model.getter_by(order_by).asc()) | |||
| return query_records | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get(cls, **kwargs): | |||
| return cls.model.get(**kwargs) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_or_none(cls, **kwargs): | |||
| try: | |||
| return cls.model.get(**kwargs) | |||
| except peewee.DoesNotExist: | |||
| return None | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def save(cls, **kwargs): | |||
| # if "id" not in kwargs: | |||
| # kwargs["id"] = get_uuid() | |||
| sample_obj = cls.model(**kwargs).save(force_insert=True) | |||
| return sample_obj | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def insert(cls, **kwargs): | |||
| if "id" not in kwargs: | |||
| kwargs["id"] = get_uuid() | |||
| kwargs["create_time"] = current_timestamp() | |||
| kwargs["create_date"] = datetime_format(datetime.now()) | |||
| kwargs["update_time"] = current_timestamp() | |||
| kwargs["update_date"] = datetime_format(datetime.now()) | |||
| sample_obj = cls.model(**kwargs).save(force_insert=True) | |||
| return sample_obj | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def insert_many(cls, data_list, batch_size=100): | |||
| with DB.atomic(): | |||
| for d in data_list: | |||
| d["create_time"] = current_timestamp() | |||
| d["create_date"] = datetime_format(datetime.now()) | |||
| for i in range(0, len(data_list), batch_size): | |||
| cls.model.insert_many(data_list[i:i + batch_size]).execute() | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def update_many_by_id(cls, data_list): | |||
| with DB.atomic(): | |||
| for data in data_list: | |||
| data["update_time"] = current_timestamp() | |||
| data["update_date"] = datetime_format(datetime.now()) | |||
| cls.model.update(data).where( | |||
| cls.model.id == data["id"]).execute() | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def update_by_id(cls, pid, data): | |||
| data["update_time"] = current_timestamp() | |||
| data["update_date"] = datetime_format(datetime.now()) | |||
| num = cls.model.update(data).where(cls.model.id == pid).execute() | |||
| return num | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_by_id(cls, pid): | |||
| try: | |||
| obj = cls.model.query(id=pid)[0] | |||
| return True, obj | |||
| except Exception as e: | |||
| return False, None | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_by_ids(cls, pids, cols=None): | |||
| if cols: | |||
| objs = cls.model.select(*cols) | |||
| else: | |||
| objs = cls.model.select() | |||
| return objs.where(cls.model.id.in_(pids)) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def delete_by_id(cls, pid): | |||
| return cls.model.delete().where(cls.model.id == pid).execute() | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def filter_delete(cls, filters): | |||
| with DB.atomic(): | |||
| num = cls.model.delete().where(*filters).execute() | |||
| return num | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def filter_update(cls, filters, update_data): | |||
| with DB.atomic(): | |||
| return cls.model.update(update_data).where(*filters).execute() | |||
| @staticmethod | |||
| def cut_list(tar_list, n): | |||
| length = len(tar_list) | |||
| arr = range(length) | |||
| result = [tuple(tar_list[x:(x + n)]) for x in arr[::n]] | |||
| return result | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def filter_scope_list(cls, in_key, in_filters_list, | |||
| filters=None, cols=None): | |||
| in_filters_tuple_list = cls.cut_list(in_filters_list, 20) | |||
| if not filters: | |||
| filters = [] | |||
| res_list = [] | |||
| if cols: | |||
| for i in in_filters_tuple_list: | |||
| query_records = cls.model.select(*cols).where( | |||
| getattr(cls.model, in_key).in_(i), *filters) | |||
| if query_records: | |||
| res_list.extend( | |||
| [query_record for query_record in query_records]) | |||
| else: | |||
| for i in in_filters_tuple_list: | |||
| query_records = cls.model.select().where( | |||
| getattr(cls.model, in_key).in_(i), *filters) | |||
| if query_records: | |||
| res_list.extend( | |||
| [query_record for query_record in query_records]) | |||
| return res_list | |||
| @@ -1,392 +1,392 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import json | |||
| import re | |||
| from copy import deepcopy | |||
| from api.db import LLMType, ParserType | |||
| from api.db.db_models import Dialog, Conversation | |||
| from api.db.services.common_service import CommonService | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle | |||
| from api.settings import chat_logger, retrievaler, kg_retrievaler | |||
| from rag.app.resume import forbidden_select_fields4resume | |||
| from rag.nlp import keyword_extraction | |||
| from rag.nlp.search import index_name | |||
| from rag.utils import rmSpace, num_tokens_from_string, encoder | |||
| from api.utils.file_utils import get_project_base_directory | |||
| class DialogService(CommonService): | |||
| model = Dialog | |||
| class ConversationService(CommonService): | |||
| model = Conversation | |||
| def message_fit_in(msg, max_length=4000): | |||
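| # Trim the conversation so its token count fits max_length: first keep only system messages plus | |||
| # the final message; if still too long, truncate the system prompt when it holds >80% of the tokens, | |||
| # otherwise truncate the last message. Returns (used_token_count, trimmed_messages). | |||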
| def count(): | |||
| nonlocal msg | |||
| tks_cnts = [] | |||
| for m in msg: | |||
| tks_cnts.append( | |||
| {"role": m["role"], "count": num_tokens_from_string(m["content"])}) | |||
| total = 0 | |||
| for m in tks_cnts: | |||
| total += m["count"] | |||
| return total | |||
| c = count() | |||
| if c < max_length: | |||
| return c, msg | |||
| msg_ = [m for m in msg[:-1] if m["role"] == "system"] | |||
| msg_.append(msg[-1]) | |||
| msg = msg_ | |||
| c = count() | |||
| if c < max_length: | |||
| return c, msg | |||
| ll = num_tokens_from_string(msg_[0]["content"]) | |||
| l = num_tokens_from_string(msg_[-1]["content"]) | |||
| if ll / (ll + l) > 0.8: | |||
| m = msg_[0]["content"] | |||
| m = encoder.decode(encoder.encode(m)[:max_length - l]) | |||
| msg[0]["content"] = m | |||
| return max_length, msg | |||
| m = msg_[1]["content"] | |||
| m = encoder.decode(encoder.encode(m)[:max_length - l]) | |||
| msg[1]["content"] = m | |||
| return max_length, msg | |||
| def llm_id2llm_type(llm_id): | |||
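| # Resolve llm_id to its model type (e.g. "chat" or "image2text") by scanning conf/llm_factories.json. | |||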
| fnm = os.path.join(get_project_base_directory(), "conf") | |||
| llm_factories = json.load(open(os.path.join(fnm, "llm_factories.json"), "r")) | |||
| for llm_factory in llm_factories["factory_llm_infos"]: | |||
| for llm in llm_factory["llm"]: | |||
| if llm_id == llm["llm_name"]: | |||
| return llm["model_type"].strip(",")[-1] | |||
| def chat(dialog, messages, stream=True, **kwargs): | |||
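| # Generator: retrieve chunks from the dialog's knowledge bases for the latest user questions and | |||
| # yield the answer incrementally when stream=True, or a single decorated answer otherwise. | |||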
| assert messages[-1]["role"] == "user", "The last content of this conversation is not from user." | |||
| llm = LLMService.query(llm_name=dialog.llm_id) | |||
| if not llm: | |||
| llm = TenantLLMService.query(tenant_id=dialog.tenant_id, llm_name=dialog.llm_id) | |||
| if not llm: | |||
| raise LookupError("LLM(%s) not found" % dialog.llm_id) | |||
| max_tokens = 8192 | |||
| else: | |||
| max_tokens = llm[0].max_tokens | |||
| kbs = KnowledgebaseService.get_by_ids(dialog.kb_ids) | |||
| embd_nms = list(set([kb.embd_id for kb in kbs])) | |||
| if len(embd_nms) != 1: | |||
| yield {"answer": "**ERROR**: Knowledge bases use different embedding models.", "reference": []} | |||
| return {"answer": "**ERROR**: Knowledge bases use different embedding models.", "reference": []} | |||
| is_kg = all([kb.parser_id == ParserType.KG for kb in kbs]) | |||
| retr = retrievaler if not is_kg else kg_retrievaler | |||
| questions = [m["content"] for m in messages if m["role"] == "user"][-3:] | |||
| attachments = kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None | |||
| if "doc_ids" in messages[-1]: | |||
| attachments = messages[-1]["doc_ids"] | |||
| for m in messages[:-1]: | |||
| if "doc_ids" in m: | |||
| attachments.extend(m["doc_ids"]) | |||
| embd_mdl = LLMBundle(dialog.tenant_id, LLMType.EMBEDDING, embd_nms[0]) | |||
| if llm_id2llm_type(dialog.llm_id) == "image2text": | |||
| chat_mdl = LLMBundle(dialog.tenant_id, LLMType.IMAGE2TEXT, dialog.llm_id) | |||
| else: | |||
| chat_mdl = LLMBundle(dialog.tenant_id, LLMType.CHAT, dialog.llm_id) | |||
| prompt_config = dialog.prompt_config | |||
| field_map = KnowledgebaseService.get_field_map(dialog.kb_ids) | |||
| # try to use sql if field mapping is good to go | |||
| if field_map: | |||
| chat_logger.info("Use SQL to retrieval:{}".format(questions[-1])) | |||
| ans = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl, prompt_config.get("quote", True)) | |||
| if ans: | |||
| yield ans | |||
| return | |||
| for p in prompt_config["parameters"]: | |||
| if p["key"] == "knowledge": | |||
| continue | |||
| if p["key"] not in kwargs and not p["optional"]: | |||
| raise KeyError("Miss parameter: " + p["key"]) | |||
| if p["key"] not in kwargs: | |||
| prompt_config["system"] = prompt_config["system"].replace( | |||
| "{%s}" % p["key"], " ") | |||
| rerank_mdl = None | |||
| if dialog.rerank_id: | |||
| rerank_mdl = LLMBundle(dialog.tenant_id, LLMType.RERANK, dialog.rerank_id) | |||
| for _ in range(len(questions) // 2): | |||
| questions.append(questions[-1]) | |||
| if "knowledge" not in [p["key"] for p in prompt_config["parameters"]]: | |||
| kbinfos = {"total": 0, "chunks": [], "doc_aggs": []} | |||
| else: | |||
| if prompt_config.get("keyword", False): | |||
| questions[-1] += keyword_extraction(chat_mdl, questions[-1]) | |||
| kbinfos = retr.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n, | |||
| dialog.similarity_threshold, | |||
| dialog.vector_similarity_weight, | |||
| doc_ids=attachments, | |||
| top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl) | |||
| knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]] | |||
| #self-rag | |||
| if dialog.prompt_config.get("self_rag") and not relevant(dialog.tenant_id, dialog.llm_id, questions[-1], knowledges): | |||
| questions[-1] = rewrite(dialog.tenant_id, dialog.llm_id, questions[-1]) | |||
| kbinfos = retr.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n, | |||
| dialog.similarity_threshold, | |||
| dialog.vector_similarity_weight, | |||
| doc_ids=attachments, | |||
| top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl) | |||
| knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]] | |||
| chat_logger.info( | |||
| "{}->{}".format(" ".join(questions), "\n->".join(knowledges))) | |||
| if not knowledges and prompt_config.get("empty_response"): | |||
| yield {"answer": prompt_config["empty_response"], "reference": kbinfos} | |||
| return {"answer": prompt_config["empty_response"], "reference": kbinfos} | |||
| kwargs["knowledge"] = "\n".join(knowledges) | |||
| gen_conf = dialog.llm_setting | |||
| msg = [{"role": "system", "content": prompt_config["system"].format(**kwargs)}] | |||
| msg.extend([{"role": m["role"], "content": re.sub(r"##\d+\$\$", "", m["content"])} | |||
| for m in messages if m["role"] != "system"]) | |||
| used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.97)) | |||
| assert len(msg) >= 2, f"message_fit_in has bug: {msg}" | |||
| if "max_tokens" in gen_conf: | |||
| gen_conf["max_tokens"] = min( | |||
| gen_conf["max_tokens"], | |||
| max_tokens - used_token_count) | |||
| def decorate_answer(answer): | |||
| nonlocal prompt_config, knowledges, kwargs, kbinfos | |||
| refs = [] | |||
| if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)): | |||
| answer, idx = retr.insert_citations(answer, | |||
| [ck["content_ltks"] | |||
| for ck in kbinfos["chunks"]], | |||
| [ck["vector"] | |||
| for ck in kbinfos["chunks"]], | |||
| embd_mdl, | |||
| tkweight=1 - dialog.vector_similarity_weight, | |||
| vtweight=dialog.vector_similarity_weight) | |||
| idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx]) | |||
| recall_docs = [ | |||
| d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx] | |||
| if not recall_docs: recall_docs = kbinfos["doc_aggs"] | |||
| kbinfos["doc_aggs"] = recall_docs | |||
| refs = deepcopy(kbinfos) | |||
| for c in refs["chunks"]: | |||
| if c.get("vector"): | |||
| del c["vector"] | |||
| if answer.lower().find("invalid key") >= 0 or answer.lower().find("invalid api") >= 0: | |||
| answer += " Please set LLM API-Key in 'User Setting -> Model Providers -> API-Key'" | |||
| return {"answer": answer, "reference": refs} | |||
| if stream: | |||
| answer = "" | |||
| for ans in chat_mdl.chat_streamly(msg[0]["content"], msg[1:], gen_conf): | |||
| answer = ans | |||
| yield {"answer": answer, "reference": {}} | |||
| yield decorate_answer(answer) | |||
| else: | |||
| answer = chat_mdl.chat( | |||
| msg[0]["content"], msg[1:], gen_conf) | |||
| chat_logger.info("User: {}|Assistant: {}".format( | |||
| msg[-1]["content"], answer)) | |||
| yield decorate_answer(answer) | |||
| def use_sql(question, field_map, tenant_id, chat_mdl, quota=True): | |||
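| # Ask the LLM to write a SQL statement over the tenant's index (field_map describes the columns), | |||
| # run it through retrievaler.sql_retrieval with one retry on error, and render the result rows as a | |||
| # markdown table; when quota is True each row gets a "##<n>$$" citation marker. | |||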
| sys_prompt = "你是一个DBA。你需要这对以下表的字段结构,根据用户的问题列表,写出最后一个问题对应的SQL。" | |||
| user_promt = """ | |||
| 表名:{}; | |||
| 数据库表字段说明如下: | |||
| {} | |||
| 问题如下: | |||
| {} | |||
| 请写出SQL, 且只要SQL,不要有其他说明及文字。 | |||
| """.format( | |||
| index_name(tenant_id), | |||
| "\n".join([f"{k}: {v}" for k, v in field_map.items()]), | |||
| question | |||
| ) | |||
| tried_times = 0 | |||
| def get_table(): | |||
| nonlocal sys_prompt, user_promt, question, tried_times | |||
| sql = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_promt}], { | |||
| "temperature": 0.06}) | |||
| print(user_promt, sql) | |||
| chat_logger.info(f"“{question}”==>{user_promt} get SQL: {sql}") | |||
| sql = re.sub(r"[\r\n]+", " ", sql.lower()) | |||
| sql = re.sub(r".*select ", "select ", sql.lower()) | |||
| sql = re.sub(r" +", " ", sql) | |||
| sql = re.sub(r"([;;]|```).*", "", sql) | |||
| if sql[:len("select ")] != "select ": | |||
| return None, None | |||
| if not re.search(r"((sum|avg|max|min)\(|group by )", sql.lower()): | |||
| if sql[:len("select *")] != "select *": | |||
| sql = "select doc_id,docnm_kwd," + sql[6:] | |||
| else: | |||
| flds = [] | |||
| for k in field_map.keys(): | |||
| if k in forbidden_select_fields4resume: | |||
| continue | |||
| if len(flds) > 11: | |||
| break | |||
| flds.append(k) | |||
| sql = "select doc_id,docnm_kwd," + ",".join(flds) + sql[8:] | |||
| print(f"“{question}” get SQL(refined): {sql}") | |||
| chat_logger.info(f"“{question}” get SQL(refined): {sql}") | |||
| tried_times += 1 | |||
| return retrievaler.sql_retrieval(sql, format="json"), sql | |||
| tbl, sql = get_table() | |||
| if tbl is None: | |||
| return None | |||
| if tbl.get("error") and tried_times <= 2: | |||
| user_promt = """ | |||
| 表名:{}; | |||
| 数据库表字段说明如下: | |||
| {} | |||
| 问题如下: | |||
| {} | |||
| 你上一次给出的错误SQL如下: | |||
| {} | |||
| 后台报错如下: | |||
| {} | |||
| 请纠正SQL中的错误再写一遍,且只要SQL,不要有其他说明及文字。 | |||
| """.format( | |||
| index_name(tenant_id), | |||
| "\n".join([f"{k}: {v}" for k, v in field_map.items()]), | |||
| question, sql, tbl["error"] | |||
| ) | |||
| tbl, sql = get_table() | |||
| chat_logger.info("TRY it again: {}".format(sql)) | |||
| chat_logger.info("GET table: {}".format(tbl)) | |||
| print(tbl) | |||
| if tbl.get("error") or len(tbl["rows"]) == 0: | |||
| return None | |||
| docid_idx = set([ii for ii, c in enumerate( | |||
| tbl["columns"]) if c["name"] == "doc_id"]) | |||
| docnm_idx = set([ii for ii, c in enumerate( | |||
| tbl["columns"]) if c["name"] == "docnm_kwd"]) | |||
| clmn_idx = [ii for ii in range( | |||
| len(tbl["columns"])) if ii not in (docid_idx | docnm_idx)] | |||
| # compose markdown table | |||
| clmns = "|" + "|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], | |||
| tbl["columns"][i]["name"])) for i in | |||
| clmn_idx]) + ("|Source|" if docid_idx and docid_idx else "|") | |||
| line = "|" + "|".join(["------" for _ in range(len(clmn_idx))]) + \ | |||
| ("|------|" if docid_idx and docid_idx else "") | |||
| rows = ["|" + | |||
| "|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + | |||
| "|" for r in tbl["rows"]] | |||
| if quota: | |||
| rows = "\n".join([r + f" ##{ii}$$ |" for ii, r in enumerate(rows)]) | |||
| else: | |||
| rows = "\n".join([r + f" ##{ii}$$ |" for ii, r in enumerate(rows)]) | |||
| rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows) | |||
| if not docid_idx or not docnm_idx: | |||
| chat_logger.warning("SQL missing field: " + sql) | |||
| return { | |||
| "answer": "\n".join([clmns, line, rows]), | |||
| "reference": {"chunks": [], "doc_aggs": []} | |||
| } | |||
| docid_idx = list(docid_idx)[0] | |||
| docnm_idx = list(docnm_idx)[0] | |||
| doc_aggs = {} | |||
| for r in tbl["rows"]: | |||
| if r[docid_idx] not in doc_aggs: | |||
| doc_aggs[r[docid_idx]] = {"doc_name": r[docnm_idx], "count": 0} | |||
| doc_aggs[r[docid_idx]]["count"] += 1 | |||
| return { | |||
| "answer": "\n".join([clmns, line, rows]), | |||
| "reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]], | |||
| "doc_aggs": [{"doc_id": did, "doc_name": d["doc_name"], "count": d["count"]} for did, d in | |||
| doc_aggs.items()]} | |||
| } | |||
| def relevant(tenant_id, llm_id, question, contents: list): | |||
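| # Self-RAG helper: ask the LLM for a yes/no judgement on whether the retrieved chunks answer the question. | |||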
| if llm_id2llm_type(llm_id) == "image2text": | |||
| chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id) | |||
| else: | |||
| chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id) | |||
| prompt = """ | |||
| You are a grader assessing relevance of a retrieved document to a user question. | |||
| It does not need to be a stringent test. The goal is to filter out erroneous retrievals. | |||
| If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. | |||
| Give a binary score, 'yes' or 'no', to indicate whether the document is relevant to the question. | |||
| No other words are needed except 'yes' or 'no'. | |||
| """ | |||
| if not contents:return False | |||
| contents = "Documents: \n" + " - ".join(contents) | |||
| contents = f"Question: {question}\n" + contents | |||
| if num_tokens_from_string(contents) >= chat_mdl.max_length - 4: | |||
| contents = encoder.decode(encoder.encode(contents)[:chat_mdl.max_length - 4]) | |||
| ans = chat_mdl.chat(prompt, [{"role": "user", "content": contents}], {"temperature": 0.01}) | |||
| if ans.lower().find("yes") >= 0: return True | |||
| return False | |||
| def rewrite(tenant_id, llm_id, question): | |||
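| # Self-RAG helper: ask the LLM to paraphrase/expand the question before retrying retrieval. | |||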
| if llm_id2llm_type(llm_id) == "image2text": | |||
| chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id) | |||
| else: | |||
| chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id) | |||
| prompt = """ | |||
| You are an expert at query expansion to generate a paraphrasing of a question. | |||
| I can't retrieve relevant information from the knowledge base by using the user's question directly. | |||
| You need to expand or paraphrase the user's question in multiple ways, such as using synonymous words/phrases, | |||
| writing abbreviations out in full, adding extra descriptions or explanations, | |||
| changing the way of expression, translating the original question into another language (English/Chinese), etc. | |||
| Return 5 versions of the question, one of which is a translation. | |||
| Just list the questions. No other words are needed. | |||
| """ | |||
| ans = chat_mdl.chat(prompt, [{"role": "user", "content": question}], {"temperature": 0.8}) | |||
| return ans | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import json | |||
| import re | |||
| from copy import deepcopy | |||
| from api.db import LLMType, ParserType | |||
| from api.db.db_models import Dialog, Conversation | |||
| from api.db.services.common_service import CommonService | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle | |||
| from api.settings import chat_logger, retrievaler, kg_retrievaler | |||
| from rag.app.resume import forbidden_select_fields4resume | |||
| from rag.nlp import keyword_extraction | |||
| from rag.nlp.search import index_name | |||
| from rag.utils import rmSpace, num_tokens_from_string, encoder | |||
| from api.utils.file_utils import get_project_base_directory | |||
| class DialogService(CommonService): | |||
| model = Dialog | |||
| class ConversationService(CommonService): | |||
| model = Conversation | |||
| def message_fit_in(msg, max_length=4000): | |||
| def count(): | |||
| nonlocal msg | |||
| tks_cnts = [] | |||
| for m in msg: | |||
| tks_cnts.append( | |||
| {"role": m["role"], "count": num_tokens_from_string(m["content"])}) | |||
| total = 0 | |||
| for m in tks_cnts: | |||
| total += m["count"] | |||
| return total | |||
| c = count() | |||
| if c < max_length: | |||
| return c, msg | |||
| msg_ = [m for m in msg[:-1] if m["role"] == "system"] | |||
| msg_.append(msg[-1]) | |||
| msg = msg_ | |||
| c = count() | |||
| if c < max_length: | |||
| return c, msg | |||
| ll = num_tokens_from_string(msg_[0]["content"]) | |||
| l = num_tokens_from_string(msg_[-1]["content"]) | |||
| if ll / (ll + l) > 0.8: | |||
| m = msg_[0]["content"] | |||
| m = encoder.decode(encoder.encode(m)[:max_length - l]) | |||
| msg[0]["content"] = m | |||
| return max_length, msg | |||
| m = msg_[1]["content"] | |||
| m = encoder.decode(encoder.encode(m)[:max_length - l]) | |||
| msg[1]["content"] = m | |||
| return max_length, msg | |||
| def llm_id2llm_type(llm_id): | |||
| fnm = os.path.join(get_project_base_directory(), "conf") | |||
| llm_factories = json.load(open(os.path.join(fnm, "llm_factories.json"), "r")) | |||
| for llm_factory in llm_factories["factory_llm_infos"]: | |||
| for llm in llm_factory["llm"]: | |||
| if llm_id == llm["llm_name"]: | |||
| return llm["model_type"].strip(",")[-1] | |||
| def chat(dialog, messages, stream=True, **kwargs): | |||
| assert messages[-1]["role"] == "user", "The last content of this conversation is not from user." | |||
| llm = LLMService.query(llm_name=dialog.llm_id) | |||
| if not llm: | |||
| llm = TenantLLMService.query(tenant_id=dialog.tenant_id, llm_name=dialog.llm_id) | |||
| if not llm: | |||
| raise LookupError("LLM(%s) not found" % dialog.llm_id) | |||
| max_tokens = 8192 | |||
| else: | |||
| max_tokens = llm[0].max_tokens | |||
| kbs = KnowledgebaseService.get_by_ids(dialog.kb_ids) | |||
| embd_nms = list(set([kb.embd_id for kb in kbs])) | |||
| if len(embd_nms) != 1: | |||
| yield {"answer": "**ERROR**: Knowledge bases use different embedding models.", "reference": []} | |||
| return {"answer": "**ERROR**: Knowledge bases use different embedding models.", "reference": []} | |||
| is_kg = all([kb.parser_id == ParserType.KG for kb in kbs]) | |||
| retr = retrievaler if not is_kg else kg_retrievaler | |||
| questions = [m["content"] for m in messages if m["role"] == "user"][-3:] | |||
| attachments = kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None | |||
| if "doc_ids" in messages[-1]: | |||
| attachments = messages[-1]["doc_ids"] | |||
| for m in messages[:-1]: | |||
| if "doc_ids" in m: | |||
| attachments.extend(m["doc_ids"]) | |||
| embd_mdl = LLMBundle(dialog.tenant_id, LLMType.EMBEDDING, embd_nms[0]) | |||
| if llm_id2llm_type(dialog.llm_id) == "image2text": | |||
| chat_mdl = LLMBundle(dialog.tenant_id, LLMType.IMAGE2TEXT, dialog.llm_id) | |||
| else: | |||
| chat_mdl = LLMBundle(dialog.tenant_id, LLMType.CHAT, dialog.llm_id) | |||
| prompt_config = dialog.prompt_config | |||
| field_map = KnowledgebaseService.get_field_map(dialog.kb_ids) | |||
| # try to use sql if field mapping is good to go | |||
| if field_map: | |||
| chat_logger.info("Use SQL to retrieval:{}".format(questions[-1])) | |||
| ans = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl, prompt_config.get("quote", True)) | |||
| if ans: | |||
| yield ans | |||
| return | |||
| for p in prompt_config["parameters"]: | |||
| if p["key"] == "knowledge": | |||
| continue | |||
| if p["key"] not in kwargs and not p["optional"]: | |||
| raise KeyError("Miss parameter: " + p["key"]) | |||
| if p["key"] not in kwargs: | |||
| prompt_config["system"] = prompt_config["system"].replace( | |||
| "{%s}" % p["key"], " ") | |||
| rerank_mdl = None | |||
| if dialog.rerank_id: | |||
| rerank_mdl = LLMBundle(dialog.tenant_id, LLMType.RERANK, dialog.rerank_id) | |||
| for _ in range(len(questions) // 2): | |||
| questions.append(questions[-1]) | |||
| if "knowledge" not in [p["key"] for p in prompt_config["parameters"]]: | |||
| kbinfos = {"total": 0, "chunks": [], "doc_aggs": []} | |||
| else: | |||
| if prompt_config.get("keyword", False): | |||
| questions[-1] += keyword_extraction(chat_mdl, questions[-1]) | |||
| kbinfos = retr.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n, | |||
| dialog.similarity_threshold, | |||
| dialog.vector_similarity_weight, | |||
| doc_ids=attachments, | |||
| top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl) | |||
| knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]] | |||
| #self-rag | |||
| if dialog.prompt_config.get("self_rag") and not relevant(dialog.tenant_id, dialog.llm_id, questions[-1], knowledges): | |||
| questions[-1] = rewrite(dialog.tenant_id, dialog.llm_id, questions[-1]) | |||
| kbinfos = retr.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n, | |||
| dialog.similarity_threshold, | |||
| dialog.vector_similarity_weight, | |||
| doc_ids=attachments, | |||
| top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl) | |||
| knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]] | |||
| chat_logger.info( | |||
| "{}->{}".format(" ".join(questions), "\n->".join(knowledges))) | |||
| if not knowledges and prompt_config.get("empty_response"): | |||
| yield {"answer": prompt_config["empty_response"], "reference": kbinfos} | |||
| return {"answer": prompt_config["empty_response"], "reference": kbinfos} | |||
| kwargs["knowledge"] = "\n".join(knowledges) | |||
| gen_conf = dialog.llm_setting | |||
| msg = [{"role": "system", "content": prompt_config["system"].format(**kwargs)}] | |||
| msg.extend([{"role": m["role"], "content": re.sub(r"##\d+\$\$", "", m["content"])} | |||
| for m in messages if m["role"] != "system"]) | |||
| used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.97)) | |||
| assert len(msg) >= 2, f"message_fit_in has bug: {msg}" | |||
| if "max_tokens" in gen_conf: | |||
| gen_conf["max_tokens"] = min( | |||
| gen_conf["max_tokens"], | |||
| max_tokens - used_token_count) | |||
| def decorate_answer(answer): | |||
| nonlocal prompt_config, knowledges, kwargs, kbinfos | |||
| refs = [] | |||
| if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)): | |||
| answer, idx = retr.insert_citations(answer, | |||
| [ck["content_ltks"] | |||
| for ck in kbinfos["chunks"]], | |||
| [ck["vector"] | |||
| for ck in kbinfos["chunks"]], | |||
| embd_mdl, | |||
| tkweight=1 - dialog.vector_similarity_weight, | |||
| vtweight=dialog.vector_similarity_weight) | |||
| idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx]) | |||
| recall_docs = [ | |||
| d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx] | |||
| if not recall_docs: recall_docs = kbinfos["doc_aggs"] | |||
| kbinfos["doc_aggs"] = recall_docs | |||
| refs = deepcopy(kbinfos) | |||
| for c in refs["chunks"]: | |||
| if c.get("vector"): | |||
| del c["vector"] | |||
| if answer.lower().find("invalid key") >= 0 or answer.lower().find("invalid api") >= 0: | |||
| answer += " Please set LLM API-Key in 'User Setting -> Model Providers -> API-Key'" | |||
| return {"answer": answer, "reference": refs} | |||
| if stream: | |||
| answer = "" | |||
| for ans in chat_mdl.chat_streamly(msg[0]["content"], msg[1:], gen_conf): | |||
| answer = ans | |||
| yield {"answer": answer, "reference": {}} | |||
| yield decorate_answer(answer) | |||
| else: | |||
| answer = chat_mdl.chat( | |||
| msg[0]["content"], msg[1:], gen_conf) | |||
| chat_logger.info("User: {}|Assistant: {}".format( | |||
| msg[-1]["content"], answer)) | |||
| yield decorate_answer(answer) | |||
| def use_sql(question, field_map, tenant_id, chat_mdl, quota=True): | |||
| sys_prompt = "你是一个DBA。你需要这对以下表的字段结构,根据用户的问题列表,写出最后一个问题对应的SQL。" | |||
| user_promt = """ | |||
| 表名:{}; | |||
| 数据库表字段说明如下: | |||
| {} | |||
| 问题如下: | |||
| {} | |||
| 请写出SQL, 且只要SQL,不要有其他说明及文字。 | |||
| """.format( | |||
| index_name(tenant_id), | |||
| "\n".join([f"{k}: {v}" for k, v in field_map.items()]), | |||
| question | |||
| ) | |||
| tried_times = 0 | |||
| def get_table(): | |||
| nonlocal sys_prompt, user_promt, question, tried_times | |||
| sql = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_promt}], { | |||
| "temperature": 0.06}) | |||
| print(user_promt, sql) | |||
| chat_logger.info(f"“{question}”==>{user_promt} get SQL: {sql}") | |||
| sql = re.sub(r"[\r\n]+", " ", sql.lower()) | |||
| sql = re.sub(r".*select ", "select ", sql.lower()) | |||
| sql = re.sub(r" +", " ", sql) | |||
| sql = re.sub(r"([;;]|```).*", "", sql) | |||
| if sql[:len("select ")] != "select ": | |||
| return None, None | |||
| if not re.search(r"((sum|avg|max|min)\(|group by )", sql.lower()): | |||
| if sql[:len("select *")] != "select *": | |||
| sql = "select doc_id,docnm_kwd," + sql[6:] | |||
| else: | |||
| flds = [] | |||
| for k in field_map.keys(): | |||
| if k in forbidden_select_fields4resume: | |||
| continue | |||
| if len(flds) > 11: | |||
| break | |||
| flds.append(k) | |||
| sql = "select doc_id,docnm_kwd," + ",".join(flds) + sql[8:] | |||
| print(f"“{question}” get SQL(refined): {sql}") | |||
| chat_logger.info(f"“{question}” get SQL(refined): {sql}") | |||
| tried_times += 1 | |||
| return retrievaler.sql_retrieval(sql, format="json"), sql | |||
| tbl, sql = get_table() | |||
| if tbl is None: | |||
| return None | |||
| if tbl.get("error") and tried_times <= 2: | |||
| user_promt = """ | |||
| 表名:{}; | |||
| 数据库表字段说明如下: | |||
| {} | |||
| 问题如下: | |||
| {} | |||
| 你上一次给出的错误SQL如下: | |||
| {} | |||
| 后台报错如下: | |||
| {} | |||
| 请纠正SQL中的错误再写一遍,且只要SQL,不要有其他说明及文字。 | |||
| """.format( | |||
| index_name(tenant_id), | |||
| "\n".join([f"{k}: {v}" for k, v in field_map.items()]), | |||
| question, sql, tbl["error"] | |||
| ) | |||
| tbl, sql = get_table() | |||
| chat_logger.info("TRY it again: {}".format(sql)) | |||
| chat_logger.info("GET table: {}".format(tbl)) | |||
| print(tbl) | |||
| if tbl.get("error") or len(tbl["rows"]) == 0: | |||
| return None | |||
| docid_idx = set([ii for ii, c in enumerate( | |||
| tbl["columns"]) if c["name"] == "doc_id"]) | |||
| docnm_idx = set([ii for ii, c in enumerate( | |||
| tbl["columns"]) if c["name"] == "docnm_kwd"]) | |||
| clmn_idx = [ii for ii in range( | |||
| len(tbl["columns"])) if ii not in (docid_idx | docnm_idx)] | |||
| # compose markdown table | |||
| clmns = "|" + "|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], | |||
| tbl["columns"][i]["name"])) for i in | |||
| clmn_idx]) + ("|Source|" if docid_idx and docid_idx else "|") | |||
| line = "|" + "|".join(["------" for _ in range(len(clmn_idx))]) + \ | |||
| ("|------|" if docid_idx and docid_idx else "") | |||
| rows = ["|" + | |||
| "|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + | |||
| "|" for r in tbl["rows"]] | |||
| if quota: | |||
| rows = "\n".join([r + f" ##{ii}$$ |" for ii, r in enumerate(rows)]) | |||
| else: | |||
| rows = "\n".join([r + f" ##{ii}$$ |" for ii, r in enumerate(rows)]) | |||
| rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows) | |||
| if not docid_idx or not docnm_idx: | |||
| chat_logger.warning("SQL missing field: " + sql) | |||
| return { | |||
| "answer": "\n".join([clmns, line, rows]), | |||
| "reference": {"chunks": [], "doc_aggs": []} | |||
| } | |||
| docid_idx = list(docid_idx)[0] | |||
| docnm_idx = list(docnm_idx)[0] | |||
| doc_aggs = {} | |||
| for r in tbl["rows"]: | |||
| if r[docid_idx] not in doc_aggs: | |||
| doc_aggs[r[docid_idx]] = {"doc_name": r[docnm_idx], "count": 0} | |||
| doc_aggs[r[docid_idx]]["count"] += 1 | |||
| return { | |||
| "answer": "\n".join([clmns, line, rows]), | |||
| "reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]], | |||
| "doc_aggs": [{"doc_id": did, "doc_name": d["doc_name"], "count": d["count"]} for did, d in | |||
| doc_aggs.items()]} | |||
| } | |||
| def relevant(tenant_id, llm_id, question, contents: list): | |||
| if llm_id2llm_type(llm_id) == "image2text": | |||
| chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id) | |||
| else: | |||
| chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id) | |||
| prompt = """ | |||
| You are a grader assessing relevance of a retrieved document to a user question. | |||
| It does not need to be a stringent test. The goal is to filter out erroneous retrievals. | |||
| If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. | |||
| Give a binary score, 'yes' or 'no', to indicate whether the document is relevant to the question. | |||
| No other words are needed except 'yes' or 'no'. | |||
| """ | |||
| if not contents:return False | |||
| contents = "Documents: \n" + " - ".join(contents) | |||
| contents = f"Question: {question}\n" + contents | |||
| if num_tokens_from_string(contents) >= chat_mdl.max_length - 4: | |||
| contents = encoder.decode(encoder.encode(contents)[:chat_mdl.max_length - 4]) | |||
| ans = chat_mdl.chat(prompt, [{"role": "user", "content": contents}], {"temperature": 0.01}) | |||
| if ans.lower().find("yes") >= 0: return True | |||
| return False | |||
| def rewrite(tenant_id, llm_id, question): | |||
| if llm_id2llm_type(llm_id) == "image2text": | |||
| chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id) | |||
| else: | |||
| chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id) | |||
| prompt = """ | |||
| You are an expert at query expansion to generate a paraphrasing of a question. | |||
| I can't retrieve relevant information from the knowledge base by using the user's question directly. | |||
| You need to expand or paraphrase the user's question in multiple ways, such as using synonyms or related phrases, | |||
| writing abbreviations out in full, adding some extra descriptions or explanations, | |||
| changing the way it is expressed, translating the original question into another language (English/Chinese), etc. | |||
| Return 5 versions of the question, one of which is a translation. | |||
| Just list the questions. No other words are needed. | |||
| """ | |||
| ans = chat_mdl.chat(prompt, [{"role": "user", "content": question}], {"temperature": 0.8}) | |||
| return ans | |||
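A hedged sketch of consuming `rewrite`'s output: since the prompt asks for the variants as plain lines, a caller would typically split on newlines and run retrieval once per variant. The ids are placeholders and `retrieve` is a stand-in for whatever retrieval call the caller uses, not a RAGFlow API.

```python
# Placeholders; `retrieve` is a stand-in, not a RAGFlow API.
tenant_id, llm_id = "tenant-0", "gpt-4o"
variants = [q.strip() for q in rewrite(tenant_id, llm_id, "What's the SLA?").split("\n") if q.strip()]
# hits = [retrieve(v) for v in variants]   # merge and deduplicate the hits afterwards
```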
| @@ -1,382 +1,382 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import random | |||
| from datetime import datetime | |||
| from elasticsearch_dsl import Q | |||
| from peewee import fn | |||
| from api.db.db_utils import bulk_insert_into_db | |||
| from api.settings import stat_logger | |||
| from api.utils import current_timestamp, get_format_time, get_uuid | |||
| from rag.settings import SVR_QUEUE_NAME | |||
| from rag.utils.es_conn import ELASTICSEARCH | |||
| from rag.utils.minio_conn import MINIO | |||
| from rag.nlp import search | |||
| from api.db import FileType, TaskStatus, ParserType | |||
| from api.db.db_models import DB, Knowledgebase, Tenant, Task | |||
| from api.db.db_models import Document | |||
| from api.db.services.common_service import CommonService | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from api.db import StatusEnum | |||
| from rag.utils.redis_conn import REDIS_CONN | |||
| class DocumentService(CommonService): | |||
| model = Document | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_by_kb_id(cls, kb_id, page_number, items_per_page, | |||
| orderby, desc, keywords): | |||
| if keywords: | |||
| docs = cls.model.select().where( | |||
| (cls.model.kb_id == kb_id), | |||
| (fn.LOWER(cls.model.name).contains(keywords.lower())) | |||
| ) | |||
| else: | |||
| docs = cls.model.select().where(cls.model.kb_id == kb_id) | |||
| count = docs.count() | |||
| if desc: | |||
| docs = docs.order_by(cls.model.getter_by(orderby).desc()) | |||
| else: | |||
| docs = docs.order_by(cls.model.getter_by(orderby).asc()) | |||
| docs = docs.paginate(page_number, items_per_page) | |||
| return list(docs.dicts()), count | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def list_documents_in_dataset(cls, dataset_id, offset, count, order_by, descend, keywords): | |||
| if keywords: | |||
| docs = cls.model.select().where( | |||
| (cls.model.kb_id == dataset_id), | |||
| (fn.LOWER(cls.model.name).contains(keywords.lower())) | |||
| ) | |||
| else: | |||
| docs = cls.model.select().where(cls.model.kb_id == dataset_id) | |||
| total = docs.count() | |||
| if descend == 'True': | |||
| docs = docs.order_by(cls.model.getter_by(order_by).desc()) | |||
| if descend == 'False': | |||
| docs = docs.order_by(cls.model.getter_by(order_by).asc()) | |||
| docs = list(docs.dicts()) | |||
| docs_length = len(docs) | |||
| if offset < 0 or offset > docs_length: | |||
| raise IndexError("Offset is out of the valid range.") | |||
| if count == -1: | |||
| return docs[offset:], total | |||
| return docs[offset:offset + count], total | |||
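A quick illustration of the offset/count contract above, with hypothetical rows: `count == -1` means "everything from `offset` onward", otherwise a slice of `count` items starting at `offset` is returned.

```python
docs = [{"id": i} for i in range(5)]

def page(docs, offset, count):
    # mirrors the slicing above, for illustration only
    if offset < 0 or offset > len(docs):
        raise IndexError("Offset is out of the valid range.")
    return docs[offset:] if count == -1 else docs[offset:offset + count]

print([d["id"] for d in page(docs, 2, -1)])   # [2, 3, 4]
print([d["id"] for d in page(docs, 1, 2)])    # [1, 2]
```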
| @classmethod | |||
| @DB.connection_context() | |||
| def insert(cls, doc): | |||
| if not cls.save(**doc): | |||
| raise RuntimeError("Database error (Document)!") | |||
| e, doc = cls.get_by_id(doc["id"]) | |||
| if not e: | |||
| raise RuntimeError("Database error (Document retrieval)!") | |||
| e, kb = KnowledgebaseService.get_by_id(doc.kb_id) | |||
| if not KnowledgebaseService.update_by_id( | |||
| kb.id, {"doc_num": kb.doc_num + 1}): | |||
| raise RuntimeError("Database error (Knowledgebase)!") | |||
| return doc | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def remove_document(cls, doc, tenant_id): | |||
| ELASTICSEARCH.deleteByQuery( | |||
| Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id)) | |||
| cls.clear_chunk_num(doc.id) | |||
| return cls.delete_by_id(doc.id) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_newly_uploaded(cls): | |||
| fields = [ | |||
| cls.model.id, | |||
| cls.model.kb_id, | |||
| cls.model.parser_id, | |||
| cls.model.parser_config, | |||
| cls.model.name, | |||
| cls.model.type, | |||
| cls.model.location, | |||
| cls.model.size, | |||
| Knowledgebase.tenant_id, | |||
| Tenant.embd_id, | |||
| Tenant.img2txt_id, | |||
| Tenant.asr_id, | |||
| cls.model.update_time] | |||
| docs = cls.model.select(*fields) \ | |||
| .join(Knowledgebase, on=(cls.model.kb_id == Knowledgebase.id)) \ | |||
| .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\ | |||
| .where( | |||
| cls.model.status == StatusEnum.VALID.value, | |||
| ~(cls.model.type == FileType.VIRTUAL.value), | |||
| cls.model.progress == 0, | |||
| cls.model.update_time >= current_timestamp() - 1000 * 600, | |||
| cls.model.run == TaskStatus.RUNNING.value)\ | |||
| .order_by(cls.model.update_time.asc()) | |||
| return list(docs.dicts()) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_unfinished_docs(cls): | |||
| fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg, cls.model.run] | |||
| docs = cls.model.select(*fields) \ | |||
| .where( | |||
| cls.model.status == StatusEnum.VALID.value, | |||
| ~(cls.model.type == FileType.VIRTUAL.value), | |||
| cls.model.progress < 1, | |||
| cls.model.progress > 0) | |||
| return list(docs.dicts()) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def increment_chunk_num(cls, doc_id, kb_id, token_num, chunk_num, duation): | |||
| num = cls.model.update(token_num=cls.model.token_num + token_num, | |||
| chunk_num=cls.model.chunk_num + chunk_num, | |||
| process_duation=cls.model.process_duation + duation).where( | |||
| cls.model.id == doc_id).execute() | |||
| if num == 0: | |||
| raise LookupError( | |||
| "Document not found which is supposed to be there") | |||
| num = Knowledgebase.update( | |||
| token_num=Knowledgebase.token_num + | |||
| token_num, | |||
| chunk_num=Knowledgebase.chunk_num + | |||
| chunk_num).where( | |||
| Knowledgebase.id == kb_id).execute() | |||
| return num | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def decrement_chunk_num(cls, doc_id, kb_id, token_num, chunk_num, duation): | |||
| num = cls.model.update(token_num=cls.model.token_num - token_num, | |||
| chunk_num=cls.model.chunk_num - chunk_num, | |||
| process_duation=cls.model.process_duation + duation).where( | |||
| cls.model.id == doc_id).execute() | |||
| if num == 0: | |||
| raise LookupError( | |||
| "Document not found which is supposed to be there") | |||
| num = Knowledgebase.update( | |||
| token_num=Knowledgebase.token_num - | |||
| token_num, | |||
| chunk_num=Knowledgebase.chunk_num - | |||
| chunk_num | |||
| ).where( | |||
| Knowledgebase.id == kb_id).execute() | |||
| return num | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def clear_chunk_num(cls, doc_id): | |||
| doc = cls.model.get_by_id(doc_id) | |||
| assert doc, "Can't fine document in database." | |||
| num = Knowledgebase.update( | |||
| token_num=Knowledgebase.token_num - | |||
| doc.token_num, | |||
| chunk_num=Knowledgebase.chunk_num - | |||
| doc.chunk_num, | |||
| doc_num=Knowledgebase.doc_num-1 | |||
| ).where( | |||
| Knowledgebase.id == doc.kb_id).execute() | |||
| return num | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_tenant_id(cls, doc_id): | |||
| docs = cls.model.select( | |||
| Knowledgebase.tenant_id).join( | |||
| Knowledgebase, on=( | |||
| Knowledgebase.id == cls.model.kb_id)).where( | |||
| cls.model.id == doc_id, Knowledgebase.status == StatusEnum.VALID.value) | |||
| docs = docs.dicts() | |||
| if not docs: | |||
| return | |||
| return docs[0]["tenant_id"] | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_tenant_id_by_name(cls, name): | |||
| docs = cls.model.select( | |||
| Knowledgebase.tenant_id).join( | |||
| Knowledgebase, on=( | |||
| Knowledgebase.id == cls.model.kb_id)).where( | |||
| cls.model.name == name, Knowledgebase.status == StatusEnum.VALID.value) | |||
| docs = docs.dicts() | |||
| if not docs: | |||
| return | |||
| return docs[0]["tenant_id"] | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_embd_id(cls, doc_id): | |||
| docs = cls.model.select( | |||
| Knowledgebase.embd_id).join( | |||
| Knowledgebase, on=( | |||
| Knowledgebase.id == cls.model.kb_id)).where( | |||
| cls.model.id == doc_id, Knowledgebase.status == StatusEnum.VALID.value) | |||
| docs = docs.dicts() | |||
| if not docs: | |||
| return | |||
| return docs[0]["embd_id"] | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_doc_id_by_doc_name(cls, doc_name): | |||
| fields = [cls.model.id] | |||
| doc_id = cls.model.select(*fields) \ | |||
| .where(cls.model.name == doc_name) | |||
| doc_id = doc_id.dicts() | |||
| if not doc_id: | |||
| return | |||
| return doc_id[0]["id"] | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_thumbnails(cls, docids): | |||
| fields = [cls.model.id, cls.model.thumbnail] | |||
| return list(cls.model.select( | |||
| *fields).where(cls.model.id.in_(docids)).dicts()) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def update_parser_config(cls, id, config): | |||
| e, d = cls.get_by_id(id) | |||
| if not e: | |||
| raise LookupError(f"Document({id}) not found.") | |||
| def dfs_update(old, new): | |||
| for k, v in new.items(): | |||
| if k not in old: | |||
| old[k] = v | |||
| continue | |||
| if isinstance(v, dict): | |||
| assert isinstance(old[k], dict) | |||
| dfs_update(old[k], v) | |||
| else: | |||
| old[k] = v | |||
| dfs_update(d.parser_config, config) | |||
| cls.update_by_id(id, {"parser_config": d.parser_config}) | |||
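To make the deep merge above concrete, a standalone sketch of the same `dfs_update` logic applied to a hypothetical parser config: nested dicts are merged key by key, everything else is overwritten or added.

```python
def dfs_update(old, new):
    # same recursion as the helper above, reproduced for illustration
    for k, v in new.items():
        if k not in old:
            old[k] = v
            continue
        if isinstance(v, dict):
            assert isinstance(old[k], dict)
            dfs_update(old[k], v)
        else:
            old[k] = v

current = {"chunk_token_num": 128, "raptor": {"use_raptor": False, "threshold": 0.1}}
patch = {"raptor": {"use_raptor": True}, "delimiter": "\n"}
dfs_update(current, patch)
print(current)
# {'chunk_token_num': 128, 'raptor': {'use_raptor': True, 'threshold': 0.1}, 'delimiter': '\n'}
```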
| @classmethod | |||
| @DB.connection_context() | |||
| def get_doc_count(cls, tenant_id): | |||
| docs = cls.model.select(cls.model.id).join(Knowledgebase, | |||
| on=(Knowledgebase.id == cls.model.kb_id)).where( | |||
| Knowledgebase.tenant_id == tenant_id) | |||
| return len(docs) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def begin2parse(cls, docid): | |||
| cls.update_by_id( | |||
| docid, {"progress": random.random() * 1 / 100., | |||
| "progress_msg": "Task dispatched...", | |||
| "process_begin_at": get_format_time() | |||
| }) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def update_progress(cls): | |||
| docs = cls.get_unfinished_docs() | |||
| for d in docs: | |||
| try: | |||
| tsks = Task.query(doc_id=d["id"], order_by=Task.create_time) | |||
| if not tsks: | |||
| continue | |||
| msg = [] | |||
| prg = 0 | |||
| finished = True | |||
| bad = 0 | |||
| e, doc = DocumentService.get_by_id(d["id"]) | |||
| status = doc.run  # TaskStatus.RUNNING.value | |||
| for t in tsks: | |||
| if 0 <= t.progress < 1: | |||
| finished = False | |||
| prg += t.progress if t.progress >= 0 else 0 | |||
| if t.progress_msg not in msg: | |||
| msg.append(t.progress_msg) | |||
| if t.progress == -1: | |||
| bad += 1 | |||
| prg /= len(tsks) | |||
| if finished and bad: | |||
| prg = -1 | |||
| status = TaskStatus.FAIL.value | |||
| elif finished: | |||
| if d["parser_config"].get("raptor", {}).get("use_raptor") and d["progress_msg"].lower().find(" raptor")<0: | |||
| queue_raptor_tasks(d) | |||
| prg *= 0.98 | |||
| msg.append("------ RAPTOR -------") | |||
| else: | |||
| status = TaskStatus.DONE.value | |||
| msg = "\n".join(msg) | |||
| info = { | |||
| "process_duation": datetime.timestamp( | |||
| datetime.now()) - | |||
| d["process_begin_at"].timestamp(), | |||
| "run": status} | |||
| if prg != 0: | |||
| info["progress"] = prg | |||
| if msg: | |||
| info["progress_msg"] = msg | |||
| cls.update_by_id(d["id"], info) | |||
| except Exception as e: | |||
| stat_logger.error("fetch task exception:" + str(e)) | |||
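A small worked sketch of the arithmetic in `update_progress`, with hypothetical per-task progress values: the document's progress is the mean over its tasks (failed tasks contribute 0, and once everything is finished any failure flips the document to FAIL), while a finished document that still owes a RAPTOR pass is held at 98% of the mean so it stays visibly "running".

```python
# Hypothetical per-task progress for one document (third task still running).
task_progress = [1.0, 1.0, 0.5]
prg = sum(p for p in task_progress if p >= 0) / len(task_progress)
print(round(prg, 4))   # 0.8333 -> stored as the document's progress

# All tasks finished but a RAPTOR pass was just queued: hold just below 100%.
task_progress = [1.0, 1.0, 1.0]
print(sum(task_progress) / len(task_progress) * 0.98)   # 0.98
```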
| @classmethod | |||
| @DB.connection_context() | |||
| def get_kb_doc_count(cls, kb_id): | |||
| return len(cls.model.select(cls.model.id).where( | |||
| cls.model.kb_id == kb_id).dicts()) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def do_cancel(cls, doc_id): | |||
| try: | |||
| _, doc = DocumentService.get_by_id(doc_id) | |||
| return doc.run == TaskStatus.CANCEL.value or doc.progress < 0 | |||
| except Exception as e: | |||
| pass | |||
| return False | |||
| def queue_raptor_tasks(doc): | |||
| def new_task(): | |||
| nonlocal doc | |||
| return { | |||
| "id": get_uuid(), | |||
| "doc_id": doc["id"], | |||
| "from_page": 0, | |||
| "to_page": -1, | |||
| "progress_msg": "Start to do RAPTOR (Recursive Abstractive Processing For Tree-Organized Retrieval)." | |||
| } | |||
| task = new_task() | |||
| bulk_insert_into_db(Task, [task], True) | |||
| task["type"] = "raptor" | |||
| assert REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=task), "Can't access Redis. Please check the Redis status." | |||
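One detail of `queue_raptor_tasks` worth spelling out: the task row is bulk-inserted first and `"type": "raptor"` is added to the in-memory dict afterwards, so the extra field travels only on the Redis message, not on the persisted `Task` row. A hedged sketch of the message a worker would see (values are illustrative):

```python
# Illustrative message shape only; the id and doc_id are placeholders.
message = {
    "id": "9c9d6a2e-...",      # from get_uuid()
    "doc_id": "doc-42",
    "from_page": 0,
    "to_page": -1,
    "progress_msg": "Start to do RAPTOR (Recursive Abstractive Processing For Tree-Organized Retrieval).",
    "type": "raptor",          # added after bulk_insert_into_db, so absent from the DB row
}
```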
| @@ -1,144 +1,144 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from api.db import StatusEnum, TenantPermission | |||
| from api.db.db_models import Knowledgebase, DB, Tenant | |||
| from api.db.services.common_service import CommonService | |||
| class KnowledgebaseService(CommonService): | |||
| model = Knowledgebase | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_by_tenant_ids(cls, joined_tenant_ids, user_id, | |||
| page_number, items_per_page, orderby, desc): | |||
| kbs = cls.model.select().where( | |||
| ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission == | |||
| TenantPermission.TEAM.value)) | ( | |||
| cls.model.tenant_id == user_id)) | |||
| & (cls.model.status == StatusEnum.VALID.value) | |||
| ) | |||
| if desc: | |||
| kbs = kbs.order_by(cls.model.getter_by(orderby).desc()) | |||
| else: | |||
| kbs = kbs.order_by(cls.model.getter_by(orderby).asc()) | |||
| kbs = kbs.paginate(page_number, items_per_page) | |||
| return list(kbs.dicts()) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_by_tenant_ids_by_offset(cls, joined_tenant_ids, user_id, offset, count, orderby, desc): | |||
| kbs = cls.model.select().where( | |||
| ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission == | |||
| TenantPermission.TEAM.value)) | ( | |||
| cls.model.tenant_id == user_id)) | |||
| & (cls.model.status == StatusEnum.VALID.value) | |||
| ) | |||
| if desc: | |||
| kbs = kbs.order_by(cls.model.getter_by(orderby).desc()) | |||
| else: | |||
| kbs = kbs.order_by(cls.model.getter_by(orderby).asc()) | |||
| kbs = list(kbs.dicts()) | |||
| kbs_length = len(kbs) | |||
| if offset < 0 or offset > kbs_length: | |||
| raise IndexError("Offset is out of the valid range.") | |||
| if count == -1: | |||
| return kbs[offset:] | |||
| return kbs[offset:offset+count] | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_detail(cls, kb_id): | |||
| fields = [ | |||
| cls.model.id, | |||
| #Tenant.embd_id, | |||
| cls.model.embd_id, | |||
| cls.model.avatar, | |||
| cls.model.name, | |||
| cls.model.language, | |||
| cls.model.description, | |||
| cls.model.permission, | |||
| cls.model.doc_num, | |||
| cls.model.token_num, | |||
| cls.model.chunk_num, | |||
| cls.model.parser_id, | |||
| cls.model.parser_config] | |||
| kbs = cls.model.select(*fields).join(Tenant, on=( | |||
| (Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where( | |||
| (cls.model.id == kb_id), | |||
| (cls.model.status == StatusEnum.VALID.value) | |||
| ) | |||
| if not kbs: | |||
| return | |||
| d = kbs[0].to_dict() | |||
| #d["embd_id"] = kbs[0].tenant.embd_id | |||
| return d | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def update_parser_config(cls, id, config): | |||
| e, m = cls.get_by_id(id) | |||
| if not e: | |||
| raise LookupError(f"knowledgebase({id}) not found.") | |||
| def dfs_update(old, new): | |||
| for k, v in new.items(): | |||
| if k not in old: | |||
| old[k] = v | |||
| continue | |||
| if isinstance(v, dict): | |||
| assert isinstance(old[k], dict) | |||
| dfs_update(old[k], v) | |||
| elif isinstance(v, list): | |||
| assert isinstance(old[k], list) | |||
| old[k] = list(set(old[k] + v)) | |||
| else: | |||
| old[k] = v | |||
| dfs_update(m.parser_config, config) | |||
| cls.update_by_id(id, {"parser_config": m.parser_config}) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_field_map(cls, ids): | |||
| conf = {} | |||
| for k in cls.get_by_ids(ids): | |||
| if k.parser_config and "field_map" in k.parser_config: | |||
| conf.update(k.parser_config["field_map"]) | |||
| return conf | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_by_name(cls, kb_name, tenant_id): | |||
| kb = cls.model.select().where( | |||
| (cls.model.name == kb_name) | |||
| & (cls.model.tenant_id == tenant_id) | |||
| & (cls.model.status == StatusEnum.VALID.value) | |||
| ) | |||
| if kb: | |||
| return True, kb[0] | |||
| return False, None | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_all_ids(cls): | |||
| return [m["id"] for m in cls.model.select(cls.model.id).dicts()] | |||
| @@ -1,242 +1,242 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from api.db.services.user_service import TenantService | |||
| from api.settings import database_logger | |||
| from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel | |||
| from api.db import LLMType | |||
| from api.db.db_models import DB, UserTenant | |||
| from api.db.db_models import LLMFactories, LLM, TenantLLM | |||
| from api.db.services.common_service import CommonService | |||
| class LLMFactoriesService(CommonService): | |||
| model = LLMFactories | |||
| class LLMService(CommonService): | |||
| model = LLM | |||
| class TenantLLMService(CommonService): | |||
| model = TenantLLM | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_api_key(cls, tenant_id, model_name): | |||
| objs = cls.query(tenant_id=tenant_id, llm_name=model_name) | |||
| if not objs: | |||
| return | |||
| return objs[0] | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_my_llms(cls, tenant_id): | |||
| fields = [ | |||
| cls.model.llm_factory, | |||
| LLMFactories.logo, | |||
| LLMFactories.tags, | |||
| cls.model.model_type, | |||
| cls.model.llm_name, | |||
| cls.model.used_tokens | |||
| ] | |||
| objs = cls.model.select(*fields).join(LLMFactories, on=(cls.model.llm_factory == LLMFactories.name)).where( | |||
| cls.model.tenant_id == tenant_id, ~cls.model.api_key.is_null()).dicts() | |||
| return list(objs) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def model_instance(cls, tenant_id, llm_type, | |||
| llm_name=None, lang="Chinese"): | |||
| e, tenant = TenantService.get_by_id(tenant_id) | |||
| if not e: | |||
| raise LookupError("Tenant not found") | |||
| if llm_type == LLMType.EMBEDDING.value: | |||
| mdlnm = tenant.embd_id if not llm_name else llm_name | |||
| elif llm_type == LLMType.SPEECH2TEXT.value: | |||
| mdlnm = tenant.asr_id | |||
| elif llm_type == LLMType.IMAGE2TEXT.value: | |||
| mdlnm = tenant.img2txt_id if not llm_name else llm_name | |||
| elif llm_type == LLMType.CHAT.value: | |||
| mdlnm = tenant.llm_id if not llm_name else llm_name | |||
| elif llm_type == LLMType.RERANK: | |||
| mdlnm = tenant.rerank_id if not llm_name else llm_name | |||
| else: | |||
| assert False, "LLM type error" | |||
| model_config = cls.get_api_key(tenant_id, mdlnm) | |||
| if model_config: model_config = model_config.to_dict() | |||
| if not model_config: | |||
| if llm_type in [LLMType.EMBEDDING, LLMType.RERANK]: | |||
| llm = LLMService.query(llm_name=llm_name if llm_name else mdlnm) | |||
| if llm and llm[0].fid in ["Youdao", "FastEmbed", "BAAI"]: | |||
| model_config = {"llm_factory": llm[0].fid, "api_key":"", "llm_name": llm_name if llm_name else mdlnm, "api_base": ""} | |||
| if not model_config: | |||
| if llm_name == "flag-embedding": | |||
| model_config = {"llm_factory": "Tongyi-Qianwen", "api_key": "", | |||
| "llm_name": llm_name, "api_base": ""} | |||
| else: | |||
| if not mdlnm: | |||
| raise LookupError(f"Type of {llm_type} model is not set.") | |||
| raise LookupError("Model({}) not authorized".format(mdlnm)) | |||
| if llm_type == LLMType.EMBEDDING.value: | |||
| if model_config["llm_factory"] not in EmbeddingModel: | |||
| return | |||
| return EmbeddingModel[model_config["llm_factory"]]( | |||
| model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"]) | |||
| if llm_type == LLMType.RERANK: | |||
| if model_config["llm_factory"] not in RerankModel: | |||
| return | |||
| return RerankModel[model_config["llm_factory"]]( | |||
| model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"]) | |||
| if llm_type == LLMType.IMAGE2TEXT.value: | |||
| if model_config["llm_factory"] not in CvModel: | |||
| return | |||
| return CvModel[model_config["llm_factory"]]( | |||
| model_config["api_key"], model_config["llm_name"], lang, | |||
| base_url=model_config["api_base"] | |||
| ) | |||
| if llm_type == LLMType.CHAT.value: | |||
| if model_config["llm_factory"] not in ChatModel: | |||
| return | |||
| return ChatModel[model_config["llm_factory"]]( | |||
| model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"]) | |||
| if llm_type == LLMType.SPEECH2TEXT: | |||
| if model_config["llm_factory"] not in Seq2txtModel: | |||
| return | |||
| return Seq2txtModel[model_config["llm_factory"]]( | |||
| model_config["api_key"], model_config["llm_name"], lang, | |||
| base_url=model_config["api_base"] | |||
| ) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def increase_usage(cls, tenant_id, llm_type, used_tokens, llm_name=None): | |||
| e, tenant = TenantService.get_by_id(tenant_id) | |||
| if not e: | |||
| raise LookupError("Tenant not found") | |||
| if llm_type == LLMType.EMBEDDING.value: | |||
| mdlnm = tenant.embd_id | |||
| elif llm_type == LLMType.SPEECH2TEXT.value: | |||
| mdlnm = tenant.asr_id | |||
| elif llm_type == LLMType.IMAGE2TEXT.value: | |||
| mdlnm = tenant.img2txt_id | |||
| elif llm_type == LLMType.CHAT.value: | |||
| mdlnm = tenant.llm_id if not llm_name else llm_name | |||
| elif llm_type == LLMType.RERANK: | |||
| mdlnm = tenant.llm_id if not llm_name else llm_name | |||
| else: | |||
| assert False, "LLM type error" | |||
| num = 0 | |||
| try: | |||
| for u in cls.query(tenant_id = tenant_id, llm_name=mdlnm): | |||
| num += cls.model.update(used_tokens = u.used_tokens + used_tokens)\ | |||
| .where(cls.model.tenant_id == tenant_id, cls.model.llm_name == mdlnm)\ | |||
| .execute() | |||
| except Exception as e: | |||
| pass | |||
| return num | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_openai_models(cls): | |||
| objs = cls.model.select().where( | |||
| (cls.model.llm_factory == "OpenAI"), | |||
| ~(cls.model.llm_name == "text-embedding-3-small"), | |||
| ~(cls.model.llm_name == "text-embedding-3-large") | |||
| ).dicts() | |||
| return list(objs) | |||
| class LLMBundle(object): | |||
| def __init__(self, tenant_id, llm_type, llm_name=None, lang="Chinese"): | |||
| self.tenant_id = tenant_id | |||
| self.llm_type = llm_type | |||
| self.llm_name = llm_name | |||
| self.mdl = TenantLLMService.model_instance( | |||
| tenant_id, llm_type, llm_name, lang=lang) | |||
| assert self.mdl, "Can't find mole for {}/{}/{}".format( | |||
| tenant_id, llm_type, llm_name) | |||
| self.max_length = 512 | |||
| for lm in LLMService.query(llm_name=llm_name): | |||
| self.max_length = lm.max_tokens | |||
| break | |||
| def encode(self, texts: list, batch_size=32): | |||
| emd, used_tokens = self.mdl.encode(texts, batch_size) | |||
| if not TenantLLMService.increase_usage( | |||
| self.tenant_id, self.llm_type, used_tokens): | |||
| database_logger.error( | |||
| "Can't update token usage for {}/EMBEDDING".format(self.tenant_id)) | |||
| return emd, used_tokens | |||
| def encode_queries(self, query: str): | |||
| emd, used_tokens = self.mdl.encode_queries(query) | |||
| if not TenantLLMService.increase_usage( | |||
| self.tenant_id, self.llm_type, used_tokens): | |||
| database_logger.error( | |||
| "Can't update token usage for {}/EMBEDDING".format(self.tenant_id)) | |||
| return emd, used_tokens | |||
| def similarity(self, query: str, texts: list): | |||
| sim, used_tokens = self.mdl.similarity(query, texts) | |||
| if not TenantLLMService.increase_usage( | |||
| self.tenant_id, self.llm_type, used_tokens): | |||
| database_logger.error( | |||
| "Can't update token usage for {}/RERANK".format(self.tenant_id)) | |||
| return sim, used_tokens | |||
| def describe(self, image, max_tokens=300): | |||
| txt, used_tokens = self.mdl.describe(image, max_tokens) | |||
| if not TenantLLMService.increase_usage( | |||
| self.tenant_id, self.llm_type, used_tokens): | |||
| database_logger.error( | |||
| "Can't update token usage for {}/IMAGE2TEXT".format(self.tenant_id)) | |||
| return txt | |||
| def transcription(self, audio): | |||
| txt, used_tokens = self.mdl.transcription(audio) | |||
| if not TenantLLMService.increase_usage( | |||
| self.tenant_id, self.llm_type, used_tokens): | |||
| database_logger.error( | |||
| "Can't update token usage for {}/SEQUENCE2TXT".format(self.tenant_id)) | |||
| return txt | |||
| def chat(self, system, history, gen_conf): | |||
| txt, used_tokens = self.mdl.chat(system, history, gen_conf) | |||
| if not TenantLLMService.increase_usage( | |||
| self.tenant_id, self.llm_type, used_tokens, self.llm_name): | |||
| database_logger.error( | |||
| "Can't update token usage for {}/CHAT".format(self.tenant_id)) | |||
| return txt | |||
| def chat_streamly(self, system, history, gen_conf): | |||
| for txt in self.mdl.chat_streamly(system, history, gen_conf): | |||
| if isinstance(txt, int): | |||
| if not TenantLLMService.increase_usage( | |||
| self.tenant_id, self.llm_type, txt, self.llm_name): | |||
| database_logger.error( | |||
| "Can't update token usage for {}/CHAT".format(self.tenant_id)) | |||
| return | |||
| yield txt | |||
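A hedged usage sketch of `LLMBundle`, assuming it runs inside the RAGFlow API context (a valid tenant with a configured chat model) and that the module path is `api.db.services.llm_service`; the tenant id and prompts are placeholders. Every call routes through `TenantLLMService.increase_usage`, so token accounting happens as a side effect.

```python
from api.db import LLMType
from api.db.services.llm_service import LLMBundle  # assumed module path

# Placeholder tenant id; a configured chat model is assumed for this tenant.
chat_mdl = LLMBundle("tenant-0", LLMType.CHAT)
answer = chat_mdl.chat(
    "You are a helpful assistant.",
    [{"role": "user", "content": "Summarize RAPTOR in one sentence."}],
    {"temperature": 0.1},
)

# Streaming variant: yielded items are answer text (exact chunking depends on the
# model adapter); the final integer token count is consumed internally for usage
# accounting and is never yielded.
for piece in chat_mdl.chat_streamly("You are a helpful assistant.",
                                    [{"role": "user", "content": "Hi"}],
                                    {"temperature": 0.1}):
    print(piece)
```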
| @@ -1,175 +1,175 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import random | |||
| from api.db.db_utils import bulk_insert_into_db | |||
| from deepdoc.parser import PdfParser | |||
| from peewee import JOIN | |||
| from api.db.db_models import DB, File2Document, File | |||
| from api.db import StatusEnum, FileType, TaskStatus | |||
| from api.db.db_models import Task, Document, Knowledgebase, Tenant | |||
| from api.db.services.common_service import CommonService | |||
| from api.db.services.document_service import DocumentService | |||
| from api.utils import current_timestamp, get_uuid | |||
| from deepdoc.parser.excel_parser import RAGFlowExcelParser | |||
| from rag.settings import SVR_QUEUE_NAME | |||
| from rag.utils.minio_conn import MINIO | |||
| from rag.utils.redis_conn import REDIS_CONN | |||
| class TaskService(CommonService): | |||
| model = Task | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_tasks(cls, task_id): | |||
| fields = [ | |||
| cls.model.id, | |||
| cls.model.doc_id, | |||
| cls.model.from_page, | |||
| cls.model.to_page, | |||
| Document.kb_id, | |||
| Document.parser_id, | |||
| Document.parser_config, | |||
| Document.name, | |||
| Document.type, | |||
| Document.location, | |||
| Document.size, | |||
| Knowledgebase.tenant_id, | |||
| Knowledgebase.language, | |||
| Knowledgebase.embd_id, | |||
| Tenant.img2txt_id, | |||
| Tenant.asr_id, | |||
| Tenant.llm_id, | |||
| cls.model.update_time] | |||
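| # A single query joins Document, Knowledgebase and Tenant so the worker gets parser settings, embedding and model IDs together. | |||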
| docs = cls.model.select(*fields) \ | |||
| .join(Document, on=(cls.model.doc_id == Document.id)) \ | |||
| .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \ | |||
| .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id)) \ | |||
| .where(cls.model.id == task_id) | |||
| docs = list(docs.dicts()) | |||
| if not docs: return [] | |||
| cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.", | |||
| progress=random.random() / 10.).where( | |||
| cls.model.id == docs[0]["id"]).execute() | |||
| return docs | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_ongoing_doc_name(cls): | |||
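| # Return (bucket, location) pairs for documents that are still being parsed and whose tasks were created within the last ten minutes. | |||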
| with DB.lock("get_task", -1): | |||
| docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \ | |||
| .join(Document, on=(cls.model.doc_id == Document.id)) \ | |||
| .join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \ | |||
| .join(File, on=(File2Document.file_id == File.id), join_type=JOIN.LEFT_OUTER) \ | |||
| .where( | |||
| Document.status == StatusEnum.VALID.value, | |||
| Document.run == TaskStatus.RUNNING.value, | |||
| ~(Document.type == FileType.VIRTUAL.value), | |||
| cls.model.progress < 1, | |||
| cls.model.create_time >= current_timestamp() - 1000 * 600 | |||
| ) | |||
| docs = list(docs.dicts()) | |||
| if not docs: return [] | |||
| return list(set([(d["parent_id"] if d["parent_id"] else d["kb_id"], d["location"]) for d in docs])) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def do_cancel(cls, id): | |||
| try: | |||
| task = cls.model.get_by_id(id) | |||
| _, doc = DocumentService.get_by_id(task.doc_id) | |||
| return doc.run == TaskStatus.CANCEL.value or doc.progress < 0 | |||
| except Exception: | |||
| pass | |||
| return False | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def update_progress(cls, id, info): | |||
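| # When the MACOS environment variable is set, the DB lock is skipped; otherwise progress writes are serialized under the "update_progress" lock. | |||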
| if os.environ.get("MACOS"): | |||
| if info["progress_msg"]: | |||
| cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where( | |||
| cls.model.id == id).execute() | |||
| if "progress" in info: | |||
| cls.model.update(progress=info["progress"]).where( | |||
| cls.model.id == id).execute() | |||
| return | |||
| with DB.lock("update_progress", -1): | |||
| if info["progress_msg"]: | |||
| cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where( | |||
| cls.model.id == id).execute() | |||
| if "progress" in info: | |||
| cls.model.update(progress=info["progress"]).where( | |||
| cls.model.id == id).execute() | |||
| def queue_tasks(doc, bucket, name): | |||
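| # Split one document into parsing tasks (page ranges for PDFs, 3000-row slices for spreadsheets) and publish them to the Redis queue. | |||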
| def new_task(): | |||
| nonlocal doc | |||
| return { | |||
| "id": get_uuid(), | |||
| "doc_id": doc["id"] | |||
| } | |||
| tsks = [] | |||
| if doc["type"] == FileType.PDF.value: | |||
| file_bin = MINIO.get(bucket, name) | |||
| do_layout = doc["parser_config"].get("layout_recognize", True) | |||
| pages = PdfParser.total_page_number(doc["name"], file_bin) | |||
| page_size = doc["parser_config"].get("task_page_size", 12) | |||
| if doc["parser_id"] == "paper": | |||
| page_size = doc["parser_config"].get("task_page_size", 22) | |||
| if doc["parser_id"] == "one": | |||
| page_size = 1000000000 | |||
| if doc["parser_id"] == "knowledge_graph": | |||
| page_size = 1000000000 | |||
| if not do_layout: | |||
| page_size = 1000000000 | |||
| page_ranges = doc["parser_config"].get("pages") | |||
| if not page_ranges: | |||
| page_ranges = [(1, 100000)] | |||
| for s, e in page_ranges: | |||
| s -= 1 | |||
| s = max(0, s) | |||
| e = min(e - 1, pages) | |||
| for p in range(s, e, page_size): | |||
| task = new_task() | |||
| task["from_page"] = p | |||
| task["to_page"] = min(p + page_size, e) | |||
| tsks.append(task) | |||
| elif doc["parser_id"] == "table": | |||
| file_bin = MINIO.get(bucket, name) | |||
| rn = RAGFlowExcelParser.row_number( | |||
| doc["name"], file_bin) | |||
| for i in range(0, rn, 3000): | |||
| task = new_task() | |||
| task["from_page"] = i | |||
| task["to_page"] = min(i + 3000, rn) | |||
| tsks.append(task) | |||
| else: | |||
| tsks.append(new_task()) | |||
| bulk_insert_into_db(Task, tsks, True) | |||
| DocumentService.begin2parse(doc["id"]) | |||
| for t in tsks: | |||
| assert REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=t), "Can't access Redis. Please check the Redis status." | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import random | |||
| from api.db.db_utils import bulk_insert_into_db | |||
| from deepdoc.parser import PdfParser | |||
| from peewee import JOIN | |||
| from api.db.db_models import DB, File2Document, File | |||
| from api.db import StatusEnum, FileType, TaskStatus | |||
| from api.db.db_models import Task, Document, Knowledgebase, Tenant | |||
| from api.db.services.common_service import CommonService | |||
| from api.db.services.document_service import DocumentService | |||
| from api.utils import current_timestamp, get_uuid | |||
| from deepdoc.parser.excel_parser import RAGFlowExcelParser | |||
| from rag.settings import SVR_QUEUE_NAME | |||
| from rag.utils.minio_conn import MINIO | |||
| from rag.utils.redis_conn import REDIS_CONN | |||
| class TaskService(CommonService): | |||
| model = Task | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_tasks(cls, task_id): | |||
| fields = [ | |||
| cls.model.id, | |||
| cls.model.doc_id, | |||
| cls.model.from_page, | |||
| cls.model.to_page, | |||
| Document.kb_id, | |||
| Document.parser_id, | |||
| Document.parser_config, | |||
| Document.name, | |||
| Document.type, | |||
| Document.location, | |||
| Document.size, | |||
| Knowledgebase.tenant_id, | |||
| Knowledgebase.language, | |||
| Knowledgebase.embd_id, | |||
| Tenant.img2txt_id, | |||
| Tenant.asr_id, | |||
| Tenant.llm_id, | |||
| cls.model.update_time] | |||
| docs = cls.model.select(*fields) \ | |||
| .join(Document, on=(cls.model.doc_id == Document.id)) \ | |||
| .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \ | |||
| .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id)) \ | |||
| .where(cls.model.id == task_id) | |||
| docs = list(docs.dicts()) | |||
| if not docs: return [] | |||
| cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.", | |||
| progress=random.random() / 10.).where( | |||
| cls.model.id == docs[0]["id"]).execute() | |||
| return docs | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_ongoing_doc_name(cls): | |||
| with DB.lock("get_task", -1): | |||
| docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \ | |||
| .join(Document, on=(cls.model.doc_id == Document.id)) \ | |||
| .join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \ | |||
| .join(File, on=(File2Document.file_id == File.id), join_type=JOIN.LEFT_OUTER) \ | |||
| .where( | |||
| Document.status == StatusEnum.VALID.value, | |||
| Document.run == TaskStatus.RUNNING.value, | |||
| ~(Document.type == FileType.VIRTUAL.value), | |||
| cls.model.progress < 1, | |||
| cls.model.create_time >= current_timestamp() - 1000 * 600 | |||
| ) | |||
| docs = list(docs.dicts()) | |||
| if not docs: return [] | |||
| return list(set([(d["parent_id"] if d["parent_id"] else d["kb_id"], d["location"]) for d in docs])) | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def do_cancel(cls, id): | |||
| try: | |||
| task = cls.model.get_by_id(id) | |||
| _, doc = DocumentService.get_by_id(task.doc_id) | |||
| return doc.run == TaskStatus.CANCEL.value or doc.progress < 0 | |||
| except Exception: | |||
| pass | |||
| return False | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def update_progress(cls, id, info): | |||
| if os.environ.get("MACOS"): | |||
| if info["progress_msg"]: | |||
| cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where( | |||
| cls.model.id == id).execute() | |||
| if "progress" in info: | |||
| cls.model.update(progress=info["progress"]).where( | |||
| cls.model.id == id).execute() | |||
| return | |||
| with DB.lock("update_progress", -1): | |||
| if info["progress_msg"]: | |||
| cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where( | |||
| cls.model.id == id).execute() | |||
| if "progress" in info: | |||
| cls.model.update(progress=info["progress"]).where( | |||
| cls.model.id == id).execute() | |||
| def queue_tasks(doc, bucket, name): | |||
| def new_task(): | |||
| nonlocal doc | |||
| return { | |||
| "id": get_uuid(), | |||
| "doc_id": doc["id"] | |||
| } | |||
| tsks = [] | |||
| if doc["type"] == FileType.PDF.value: | |||
| file_bin = MINIO.get(bucket, name) | |||
| do_layout = doc["parser_config"].get("layout_recognize", True) | |||
| pages = PdfParser.total_page_number(doc["name"], file_bin) | |||
| page_size = doc["parser_config"].get("task_page_size", 12) | |||
| if doc["parser_id"] == "paper": | |||
| page_size = doc["parser_config"].get("task_page_size", 22) | |||
| if doc["parser_id"] == "one": | |||
| page_size = 1000000000 | |||
| if doc["parser_id"] == "knowledge_graph": | |||
| page_size = 1000000000 | |||
| if not do_layout: | |||
| page_size = 1000000000 | |||
| page_ranges = doc["parser_config"].get("pages") | |||
| if not page_ranges: | |||
| page_ranges = [(1, 100000)] | |||
| for s, e in page_ranges: | |||
| s -= 1 | |||
| s = max(0, s) | |||
| e = min(e - 1, pages) | |||
| for p in range(s, e, page_size): | |||
| task = new_task() | |||
| task["from_page"] = p | |||
| task["to_page"] = min(p + page_size, e) | |||
| tsks.append(task) | |||
| elif doc["parser_id"] == "table": | |||
| file_bin = MINIO.get(bucket, name) | |||
| rn = RAGFlowExcelParser.row_number( | |||
| doc["name"], file_bin) | |||
| for i in range(0, rn, 3000): | |||
| task = new_task() | |||
| task["from_page"] = i | |||
| task["to_page"] = min(i + 3000, rn) | |||
| tsks.append(task) | |||
| else: | |||
| tsks.append(new_task()) | |||
| bulk_insert_into_db(Task, tsks, True) | |||
| DocumentService.begin2parse(doc["id"]) | |||
| for t in tsks: | |||
| assert REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=t), "Can't access Redis. Please check the Redis status." | |||
| @@ -1,100 +1,100 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import logging | |||
| import os | |||
| import signal | |||
| import sys | |||
| import time | |||
| import traceback | |||
| from concurrent.futures import ThreadPoolExecutor | |||
| from werkzeug.serving import run_simple | |||
| from api.apps import app | |||
| from api.db.runtime_config import RuntimeConfig | |||
| from api.db.services.document_service import DocumentService | |||
| from api.settings import ( | |||
| HOST, HTTP_PORT, access_logger, database_logger, stat_logger, | |||
| ) | |||
| from api import utils | |||
| from api.db.db_models import init_database_tables as init_web_db | |||
| from api.db.init_data import init_web_data | |||
| from api.versions import get_versions | |||
| def update_progress(): | |||
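| # Background loop: call DocumentService.update_progress() once a second and log any failure. | |||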
| while True: | |||
| time.sleep(1) | |||
| try: | |||
| DocumentService.update_progress() | |||
| except Exception as e: | |||
| stat_logger.error("update_progress exception:" + str(e)) | |||
| if __name__ == '__main__': | |||
| print(""" | |||
| ____ ______ __ | |||
| / __ \ ____ _ ____ _ / ____// /____ _ __ | |||
| / /_/ // __ `// __ `// /_ / // __ \| | /| / / | |||
| / _, _// /_/ // /_/ // __/ / // /_/ /| |/ |/ / | |||
| /_/ |_| \__,_/ \__, //_/ /_/ \____/ |__/|__/ | |||
| /____/ | |||
| """, flush=True) | |||
| stat_logger.info( | |||
| f'project base: {utils.file_utils.get_project_base_directory()}' | |||
| ) | |||
| # init db | |||
| init_web_db() | |||
| init_web_data() | |||
| # init runtime config | |||
| import argparse | |||
| parser = argparse.ArgumentParser() | |||
| parser.add_argument('--version', default=False, help="rag flow version", action='store_true') | |||
| parser.add_argument('--debug', default=False, help="debug mode", action='store_true') | |||
| args = parser.parse_args() | |||
| if args.version: | |||
| print(get_versions()) | |||
| sys.exit(0) | |||
| RuntimeConfig.DEBUG = args.debug | |||
| if RuntimeConfig.DEBUG: | |||
| stat_logger.info("run on debug mode") | |||
| RuntimeConfig.init_env() | |||
| RuntimeConfig.init_config(JOB_SERVER_HOST=HOST, HTTP_PORT=HTTP_PORT) | |||
| peewee_logger = logging.getLogger('peewee') | |||
| peewee_logger.propagate = False | |||
| # rag_arch.common.log.ROpenHandler | |||
| peewee_logger.addHandler(database_logger.handlers[0]) | |||
| peewee_logger.setLevel(database_logger.level) | |||
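| # Run the progress updater in a single worker thread alongside the HTTP server. | |||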
| thr = ThreadPoolExecutor(max_workers=1) | |||
| thr.submit(update_progress) | |||
| # start http server | |||
| try: | |||
| stat_logger.info("RAG Flow http server start...") | |||
| werkzeug_logger = logging.getLogger("werkzeug") | |||
| for h in access_logger.handlers: | |||
| werkzeug_logger.addHandler(h) | |||
| run_simple(hostname=HOST, port=HTTP_PORT, application=app, threaded=True, use_reloader=RuntimeConfig.DEBUG, use_debugger=RuntimeConfig.DEBUG) | |||
| except Exception: | |||
| traceback.print_exc() | |||
| os.kill(os.getpid(), signal.SIGKILL) | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import logging | |||
| import os | |||
| import signal | |||
| import sys | |||
| import time | |||
| import traceback | |||
| from concurrent.futures import ThreadPoolExecutor | |||
| from werkzeug.serving import run_simple | |||
| from api.apps import app | |||
| from api.db.runtime_config import RuntimeConfig | |||
| from api.db.services.document_service import DocumentService | |||
| from api.settings import ( | |||
| HOST, HTTP_PORT, access_logger, database_logger, stat_logger, | |||
| ) | |||
| from api import utils | |||
| from api.db.db_models import init_database_tables as init_web_db | |||
| from api.db.init_data import init_web_data | |||
| from api.versions import get_versions | |||
| def update_progress(): | |||
| while True: | |||
| time.sleep(1) | |||
| try: | |||
| DocumentService.update_progress() | |||
| except Exception as e: | |||
| stat_logger.error("update_progress exception:" + str(e)) | |||
| if __name__ == '__main__': | |||
| print(""" | |||
| ____ ______ __ | |||
| / __ \ ____ _ ____ _ / ____// /____ _ __ | |||
| / /_/ // __ `// __ `// /_ / // __ \| | /| / / | |||
| / _, _// /_/ // /_/ // __/ / // /_/ /| |/ |/ / | |||
| /_/ |_| \__,_/ \__, //_/ /_/ \____/ |__/|__/ | |||
| /____/ | |||
| """, flush=True) | |||
| stat_logger.info( | |||
| f'project base: {utils.file_utils.get_project_base_directory()}' | |||
| ) | |||
| # init db | |||
| init_web_db() | |||
| init_web_data() | |||
| # init runtime config | |||
| import argparse | |||
| parser = argparse.ArgumentParser() | |||
| parser.add_argument('--version', default=False, help="rag flow version", action='store_true') | |||
| parser.add_argument('--debug', default=False, help="debug mode", action='store_true') | |||
| args = parser.parse_args() | |||
| if args.version: | |||
| print(get_versions()) | |||
| sys.exit(0) | |||
| RuntimeConfig.DEBUG = args.debug | |||
| if RuntimeConfig.DEBUG: | |||
| stat_logger.info("run on debug mode") | |||
| RuntimeConfig.init_env() | |||
| RuntimeConfig.init_config(JOB_SERVER_HOST=HOST, HTTP_PORT=HTTP_PORT) | |||
| peewee_logger = logging.getLogger('peewee') | |||
| peewee_logger.propagate = False | |||
| # rag_arch.common.log.ROpenHandler | |||
| peewee_logger.addHandler(database_logger.handlers[0]) | |||
| peewee_logger.setLevel(database_logger.level) | |||
| thr = ThreadPoolExecutor(max_workers=1) | |||
| thr.submit(update_progress) | |||
| # start http server | |||
| try: | |||
| stat_logger.info("RAG Flow http server start...") | |||
| werkzeug_logger = logging.getLogger("werkzeug") | |||
| for h in access_logger.handlers: | |||
| werkzeug_logger.addHandler(h) | |||
| run_simple(hostname=HOST, port=HTTP_PORT, application=app, threaded=True, use_reloader=RuntimeConfig.DEBUG, use_debugger=RuntimeConfig.DEBUG) | |||
| except Exception: | |||
| traceback.print_exc() | |||
| os.kill(os.getpid(), signal.SIGKILL) | |||
| @@ -1,251 +1,251 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| from enum import IntEnum, Enum | |||
| from api.utils.file_utils import get_project_base_directory | |||
| from api.utils.log_utils import LoggerFactory, getLogger | |||
| # Logger | |||
| LoggerFactory.set_directory( | |||
| os.path.join( | |||
| get_project_base_directory(), | |||
| "logs", | |||
| "api")) | |||
| # {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0} | |||
| LoggerFactory.LEVEL = 30 | |||
| stat_logger = getLogger("stat") | |||
| access_logger = getLogger("access") | |||
| database_logger = getLogger("database") | |||
| chat_logger = getLogger("chat") | |||
| from rag.utils.es_conn import ELASTICSEARCH | |||
| from rag.nlp import search | |||
| from graphrag import search as kg_search | |||
| from api.utils import get_base_config, decrypt_database_config | |||
| API_VERSION = "v1" | |||
| RAG_FLOW_SERVICE_NAME = "ragflow" | |||
| SERVER_MODULE = "rag_flow_server.py" | |||
| TEMP_DIRECTORY = os.path.join(get_project_base_directory(), "temp") | |||
| RAG_FLOW_CONF_PATH = os.path.join(get_project_base_directory(), "conf") | |||
| SUBPROCESS_STD_LOG_NAME = "std.log" | |||
| ERROR_REPORT = True | |||
| ERROR_REPORT_WITH_PATH = False | |||
| MAX_TIMESTAMP_INTERVAL = 60 | |||
| SESSION_VALID_PERIOD = 7 * 24 * 60 * 60 | |||
| REQUEST_TRY_TIMES = 3 | |||
| REQUEST_WAIT_SEC = 2 | |||
| REQUEST_MAX_WAIT_SEC = 300 | |||
| USE_REGISTRY = get_base_config("use_registry") | |||
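| # Built-in default model names per LLM supplier (factory). | |||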
| default_llm = { | |||
| "Tongyi-Qianwen": { | |||
| "chat_model": "qwen-plus", | |||
| "embedding_model": "text-embedding-v2", | |||
| "image2text_model": "qwen-vl-max", | |||
| "asr_model": "paraformer-realtime-8k-v1", | |||
| }, | |||
| "OpenAI": { | |||
| "chat_model": "gpt-3.5-turbo", | |||
| "embedding_model": "text-embedding-ada-002", | |||
| "image2text_model": "gpt-4-vision-preview", | |||
| "asr_model": "whisper-1", | |||
| }, | |||
| "Azure-OpenAI": { | |||
| "chat_model": "azure-gpt-35-turbo", | |||
| "embedding_model": "azure-text-embedding-ada-002", | |||
| "image2text_model": "azure-gpt-4-vision-preview", | |||
| "asr_model": "azure-whisper-1", | |||
| }, | |||
| "ZHIPU-AI": { | |||
| "chat_model": "glm-3-turbo", | |||
| "embedding_model": "embedding-2", | |||
| "image2text_model": "glm-4v", | |||
| "asr_model": "", | |||
| }, | |||
| "Ollama": { | |||
| "chat_model": "qwen-14B-chat", | |||
| "embedding_model": "flag-embedding", | |||
| "image2text_model": "", | |||
| "asr_model": "", | |||
| }, | |||
| "Moonshot": { | |||
| "chat_model": "moonshot-v1-8k", | |||
| "embedding_model": "", | |||
| "image2text_model": "", | |||
| "asr_model": "", | |||
| }, | |||
| "DeepSeek": { | |||
| "chat_model": "deepseek-chat", | |||
| "embedding_model": "", | |||
| "image2text_model": "", | |||
| "asr_model": "", | |||
| }, | |||
| "VolcEngine": { | |||
| "chat_model": "", | |||
| "embedding_model": "", | |||
| "image2text_model": "", | |||
| "asr_model": "", | |||
| }, | |||
| "BAAI": { | |||
| "chat_model": "", | |||
| "embedding_model": "BAAI/bge-large-zh-v1.5", | |||
| "image2text_model": "", | |||
| "asr_model": "", | |||
| "rerank_model": "BAAI/bge-reranker-v2-m3", | |||
| } | |||
| } | |||
| LLM = get_base_config("user_default_llm", {}) | |||
| LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen") | |||
| LLM_BASE_URL = LLM.get("base_url") | |||
| if LLM_FACTORY not in default_llm: | |||
| print( | |||
| "\33[91m【ERROR】\33[0m:", | |||
| f"LLM factory {LLM_FACTORY} has not supported yet, switch to 'Tongyi-Qianwen/QWen' automatically, and please check the API_KEY in service_conf.yaml.") | |||
| LLM_FACTORY = "Tongyi-Qianwen" | |||
| CHAT_MDL = default_llm[LLM_FACTORY]["chat_model"] | |||
| EMBEDDING_MDL = default_llm["BAAI"]["embedding_model"] | |||
| RERANK_MDL = default_llm["BAAI"]["rerank_model"] | |||
| ASR_MDL = default_llm[LLM_FACTORY]["asr_model"] | |||
| IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"] | |||
| API_KEY = LLM.get("api_key", "") | |||
| PARSERS = LLM.get( | |||
| "parsers", | |||
| "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email") | |||
| # distribution | |||
| DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False) | |||
| RAG_FLOW_UPDATE_CHECK = False | |||
| HOST = get_base_config(RAG_FLOW_SERVICE_NAME, {}).get("host", "127.0.0.1") | |||
| HTTP_PORT = get_base_config(RAG_FLOW_SERVICE_NAME, {}).get("http_port") | |||
| SECRET_KEY = get_base_config( | |||
| RAG_FLOW_SERVICE_NAME, | |||
| {}).get( | |||
| "secret_key", | |||
| "infiniflow") | |||
| TOKEN_EXPIRE_IN = get_base_config( | |||
| RAG_FLOW_SERVICE_NAME, {}).get( | |||
| "token_expires_in", 3600) | |||
| NGINX_HOST = get_base_config( | |||
| RAG_FLOW_SERVICE_NAME, {}).get( | |||
| "nginx", {}).get("host") or HOST | |||
| NGINX_HTTP_PORT = get_base_config( | |||
| RAG_FLOW_SERVICE_NAME, {}).get( | |||
| "nginx", {}).get("http_port") or HTTP_PORT | |||
| RANDOM_INSTANCE_ID = get_base_config( | |||
| RAG_FLOW_SERVICE_NAME, {}).get( | |||
| "random_instance_id", False) | |||
| PROXY = get_base_config(RAG_FLOW_SERVICE_NAME, {}).get("proxy") | |||
| PROXY_PROTOCOL = get_base_config(RAG_FLOW_SERVICE_NAME, {}).get("protocol") | |||
| DATABASE = decrypt_database_config(name="mysql") | |||
| # Switch | |||
| # upload | |||
| UPLOAD_DATA_FROM_CLIENT = True | |||
| # authentication | |||
| AUTHENTICATION_CONF = get_base_config("authentication", {}) | |||
| # client | |||
| CLIENT_AUTHENTICATION = AUTHENTICATION_CONF.get( | |||
| "client", {}).get( | |||
| "switch", False) | |||
| HTTP_APP_KEY = AUTHENTICATION_CONF.get("client", {}).get("http_app_key") | |||
| GITHUB_OAUTH = get_base_config("oauth", {}).get("github") | |||
| FEISHU_OAUTH = get_base_config("oauth", {}).get("feishu") | |||
| WECHAT_OAUTH = get_base_config("oauth", {}).get("wechat") | |||
| # site | |||
| SITE_AUTHENTICATION = AUTHENTICATION_CONF.get("site", {}).get("switch", False) | |||
| # permission | |||
| PERMISSION_CONF = get_base_config("permission", {}) | |||
| PERMISSION_SWITCH = PERMISSION_CONF.get("switch") | |||
| COMPONENT_PERMISSION = PERMISSION_CONF.get("component") | |||
| DATASET_PERMISSION = PERMISSION_CONF.get("dataset") | |||
| HOOK_MODULE = get_base_config("hook_module") | |||
| HOOK_SERVER_NAME = get_base_config("hook_server_name") | |||
| ENABLE_MODEL_STORE = get_base_config('enable_model_store', False) | |||
| # authentication | |||
| USE_AUTHENTICATION = False | |||
| USE_DATA_AUTHENTICATION = False | |||
| AUTOMATIC_AUTHORIZATION_OUTPUT_DATA = True | |||
| USE_DEFAULT_TIMEOUT = False | |||
| AUTHENTICATION_DEFAULT_TIMEOUT = 7 * 24 * 60 * 60 # s | |||
| PRIVILEGE_COMMAND_WHITELIST = [] | |||
| CHECK_NODES_IDENTITY = False | |||
| retrievaler = search.Dealer(ELASTICSEARCH) | |||
| kg_retrievaler = kg_search.KGSearch(ELASTICSEARCH) | |||
| class CustomEnum(Enum): | |||
| @classmethod | |||
| def valid(cls, value): | |||
| try: | |||
| cls(value) | |||
| return True | |||
| except BaseException: | |||
| return False | |||
| @classmethod | |||
| def values(cls): | |||
| return [member.value for member in cls.__members__.values()] | |||
| @classmethod | |||
| def names(cls): | |||
| return [member.name for member in cls.__members__.values()] | |||
| class PythonDependenceName(CustomEnum): | |||
| Rag_Source_Code = "python" | |||
| Python_Env = "miniconda" | |||
| class ModelStorage(CustomEnum): | |||
| REDIS = "redis" | |||
| MYSQL = "mysql" | |||
| class RetCode(IntEnum, CustomEnum): | |||
| SUCCESS = 0 | |||
| NOT_EFFECTIVE = 10 | |||
| EXCEPTION_ERROR = 100 | |||
| ARGUMENT_ERROR = 101 | |||
| DATA_ERROR = 102 | |||
| OPERATING_ERROR = 103 | |||
| CONNECTION_ERROR = 105 | |||
| RUNNING = 106 | |||
| PERMISSION_ERROR = 108 | |||
| AUTHENTICATION_ERROR = 109 | |||
| UNAUTHORIZED = 401 | |||
| SERVER_ERROR = 500 | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| from enum import IntEnum, Enum | |||
| from api.utils.file_utils import get_project_base_directory | |||
| from api.utils.log_utils import LoggerFactory, getLogger | |||
| # Logger | |||
| LoggerFactory.set_directory( | |||
| os.path.join( | |||
| get_project_base_directory(), | |||
| "logs", | |||
| "api")) | |||
| # {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0} | |||
| LoggerFactory.LEVEL = 30 | |||
| stat_logger = getLogger("stat") | |||
| access_logger = getLogger("access") | |||
| database_logger = getLogger("database") | |||
| chat_logger = getLogger("chat") | |||
| from rag.utils.es_conn import ELASTICSEARCH | |||
| from rag.nlp import search | |||
| from graphrag import search as kg_search | |||
| from api.utils import get_base_config, decrypt_database_config | |||
| API_VERSION = "v1" | |||
| RAG_FLOW_SERVICE_NAME = "ragflow" | |||
| SERVER_MODULE = "rag_flow_server.py" | |||
| TEMP_DIRECTORY = os.path.join(get_project_base_directory(), "temp") | |||
| RAG_FLOW_CONF_PATH = os.path.join(get_project_base_directory(), "conf") | |||
| SUBPROCESS_STD_LOG_NAME = "std.log" | |||
| ERROR_REPORT = True | |||
| ERROR_REPORT_WITH_PATH = False | |||
| MAX_TIMESTAMP_INTERVAL = 60 | |||
| SESSION_VALID_PERIOD = 7 * 24 * 60 * 60 | |||
| REQUEST_TRY_TIMES = 3 | |||
| REQUEST_WAIT_SEC = 2 | |||
| REQUEST_MAX_WAIT_SEC = 300 | |||
| USE_REGISTRY = get_base_config("use_registry") | |||
| default_llm = { | |||
| "Tongyi-Qianwen": { | |||
| "chat_model": "qwen-plus", | |||
| "embedding_model": "text-embedding-v2", | |||
| "image2text_model": "qwen-vl-max", | |||
| "asr_model": "paraformer-realtime-8k-v1", | |||
| }, | |||
| "OpenAI": { | |||
| "chat_model": "gpt-3.5-turbo", | |||
| "embedding_model": "text-embedding-ada-002", | |||
| "image2text_model": "gpt-4-vision-preview", | |||
| "asr_model": "whisper-1", | |||
| }, | |||
| "Azure-OpenAI": { | |||
| "chat_model": "azure-gpt-35-turbo", | |||
| "embedding_model": "azure-text-embedding-ada-002", | |||
| "image2text_model": "azure-gpt-4-vision-preview", | |||
| "asr_model": "azure-whisper-1", | |||
| }, | |||
| "ZHIPU-AI": { | |||
| "chat_model": "glm-3-turbo", | |||
| "embedding_model": "embedding-2", | |||
| "image2text_model": "glm-4v", | |||
| "asr_model": "", | |||
| }, | |||
| "Ollama": { | |||
| "chat_model": "qwen-14B-chat", | |||
| "embedding_model": "flag-embedding", | |||
| "image2text_model": "", | |||
| "asr_model": "", | |||
| }, | |||
| "Moonshot": { | |||
| "chat_model": "moonshot-v1-8k", | |||
| "embedding_model": "", | |||
| "image2text_model": "", | |||
| "asr_model": "", | |||
| }, | |||
| "DeepSeek": { | |||
| "chat_model": "deepseek-chat", | |||
| "embedding_model": "", | |||
| "image2text_model": "", | |||
| "asr_model": "", | |||
| }, | |||
| "VolcEngine": { | |||
| "chat_model": "", | |||
| "embedding_model": "", | |||
| "image2text_model": "", | |||
| "asr_model": "", | |||
| }, | |||
| "BAAI": { | |||
| "chat_model": "", | |||
| "embedding_model": "BAAI/bge-large-zh-v1.5", | |||
| "image2text_model": "", | |||
| "asr_model": "", | |||
| "rerank_model": "BAAI/bge-reranker-v2-m3", | |||
| } | |||
| } | |||
| LLM = get_base_config("user_default_llm", {}) | |||
| LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen") | |||
| LLM_BASE_URL = LLM.get("base_url") | |||
| if LLM_FACTORY not in default_llm: | |||
| print( | |||
| "\33[91m【ERROR】\33[0m:", | |||
| f"LLM factory {LLM_FACTORY} has not supported yet, switch to 'Tongyi-Qianwen/QWen' automatically, and please check the API_KEY in service_conf.yaml.") | |||
| LLM_FACTORY = "Tongyi-Qianwen" | |||
| CHAT_MDL = default_llm[LLM_FACTORY]["chat_model"] | |||
| EMBEDDING_MDL = default_llm["BAAI"]["embedding_model"] | |||
| RERANK_MDL = default_llm["BAAI"]["rerank_model"] | |||
| ASR_MDL = default_llm[LLM_FACTORY]["asr_model"] | |||
| IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"] | |||
| API_KEY = LLM.get("api_key", "") | |||
| PARSERS = LLM.get( | |||
| "parsers", | |||
| "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email") | |||
| # distribution | |||
| DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False) | |||
| RAG_FLOW_UPDATE_CHECK = False | |||
| HOST = get_base_config(RAG_FLOW_SERVICE_NAME, {}).get("host", "127.0.0.1") | |||
| HTTP_PORT = get_base_config(RAG_FLOW_SERVICE_NAME, {}).get("http_port") | |||
| SECRET_KEY = get_base_config( | |||
| RAG_FLOW_SERVICE_NAME, | |||
| {}).get( | |||
| "secret_key", | |||
| "infiniflow") | |||
| TOKEN_EXPIRE_IN = get_base_config( | |||
| RAG_FLOW_SERVICE_NAME, {}).get( | |||
| "token_expires_in", 3600) | |||
| NGINX_HOST = get_base_config( | |||
| RAG_FLOW_SERVICE_NAME, {}).get( | |||
| "nginx", {}).get("host") or HOST | |||
| NGINX_HTTP_PORT = get_base_config( | |||
| RAG_FLOW_SERVICE_NAME, {}).get( | |||
| "nginx", {}).get("http_port") or HTTP_PORT | |||
| RANDOM_INSTANCE_ID = get_base_config( | |||
| RAG_FLOW_SERVICE_NAME, {}).get( | |||
| "random_instance_id", False) | |||
| PROXY = get_base_config(RAG_FLOW_SERVICE_NAME, {}).get("proxy") | |||
| PROXY_PROTOCOL = get_base_config(RAG_FLOW_SERVICE_NAME, {}).get("protocol") | |||
| DATABASE = decrypt_database_config(name="mysql") | |||
| # Switch | |||
| # upload | |||
| UPLOAD_DATA_FROM_CLIENT = True | |||
| # authentication | |||
| AUTHENTICATION_CONF = get_base_config("authentication", {}) | |||
| # client | |||
| CLIENT_AUTHENTICATION = AUTHENTICATION_CONF.get( | |||
| "client", {}).get( | |||
| "switch", False) | |||
| HTTP_APP_KEY = AUTHENTICATION_CONF.get("client", {}).get("http_app_key") | |||
| GITHUB_OAUTH = get_base_config("oauth", {}).get("github") | |||
| FEISHU_OAUTH = get_base_config("oauth", {}).get("feishu") | |||
| WECHAT_OAUTH = get_base_config("oauth", {}).get("wechat") | |||
| # site | |||
| SITE_AUTHENTICATION = AUTHENTICATION_CONF.get("site", {}).get("switch", False) | |||
| # permission | |||
| PERMISSION_CONF = get_base_config("permission", {}) | |||
| PERMISSION_SWITCH = PERMISSION_CONF.get("switch") | |||
| COMPONENT_PERMISSION = PERMISSION_CONF.get("component") | |||
| DATASET_PERMISSION = PERMISSION_CONF.get("dataset") | |||
| HOOK_MODULE = get_base_config("hook_module") | |||
| HOOK_SERVER_NAME = get_base_config("hook_server_name") | |||
| ENABLE_MODEL_STORE = get_base_config('enable_model_store', False) | |||
| # authentication | |||
| USE_AUTHENTICATION = False | |||
| USE_DATA_AUTHENTICATION = False | |||
| AUTOMATIC_AUTHORIZATION_OUTPUT_DATA = True | |||
| USE_DEFAULT_TIMEOUT = False | |||
| AUTHENTICATION_DEFAULT_TIMEOUT = 7 * 24 * 60 * 60 # s | |||
| PRIVILEGE_COMMAND_WHITELIST = [] | |||
| CHECK_NODES_IDENTITY = False | |||
| retrievaler = search.Dealer(ELASTICSEARCH) | |||
| kg_retrievaler = kg_search.KGSearch(ELASTICSEARCH) | |||
| class CustomEnum(Enum): | |||
| @classmethod | |||
| def valid(cls, value): | |||
| try: | |||
| cls(value) | |||
| return True | |||
| except BaseException: | |||
| return False | |||
| @classmethod | |||
| def values(cls): | |||
| return [member.value for member in cls.__members__.values()] | |||
| @classmethod | |||
| def names(cls): | |||
| return [member.name for member in cls.__members__.values()] | |||
| class PythonDependenceName(CustomEnum): | |||
| Rag_Source_Code = "python" | |||
| Python_Env = "miniconda" | |||
| class ModelStorage(CustomEnum): | |||
| REDIS = "redis" | |||
| MYSQL = "mysql" | |||
| class RetCode(IntEnum, CustomEnum): | |||
| SUCCESS = 0 | |||
| NOT_EFFECTIVE = 10 | |||
| EXCEPTION_ERROR = 100 | |||
| ARGUMENT_ERROR = 101 | |||
| DATA_ERROR = 102 | |||
| OPERATING_ERROR = 103 | |||
| CONNECTION_ERROR = 105 | |||
| RUNNING = 106 | |||
| PERMISSION_ERROR = 108 | |||
| AUTHENTICATION_ERROR = 109 | |||
| UNAUTHORIZED = 401 | |||
| SERVER_ERROR = 500 | |||
| @@ -1,346 +1,346 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import base64 | |||
| import datetime | |||
| import io | |||
| import json | |||
| import os | |||
| import pickle | |||
| import socket | |||
| import time | |||
| import uuid | |||
| import requests | |||
| from enum import Enum, IntEnum | |||
| import importlib | |||
| from Cryptodome.PublicKey import RSA | |||
| from Cryptodome.Cipher import PKCS1_v1_5 as Cipher_pkcs1_v1_5 | |||
| from filelock import FileLock | |||
| from . import file_utils | |||
| SERVICE_CONF = "service_conf.yaml" | |||
| def conf_realpath(conf_name): | |||
| conf_path = f"conf/{conf_name}" | |||
| return os.path.join(file_utils.get_project_base_directory(), conf_path) | |||
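| # Lookup order: local.service_conf.yaml entry, then service_conf.yaml, then the upper-cased environment variable as the default. | |||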
| def get_base_config(key, default=None, conf_name=SERVICE_CONF) -> dict: | |||
| local_config = {} | |||
| local_path = conf_realpath(f'local.{conf_name}') | |||
| if default is None: | |||
| default = os.environ.get(key.upper()) | |||
| if os.path.exists(local_path): | |||
| local_config = file_utils.load_yaml_conf(local_path) | |||
| if not isinstance(local_config, dict): | |||
| raise ValueError(f'Invalid config file: "{local_path}".') | |||
| if key is not None and key in local_config: | |||
| return local_config[key] | |||
| config_path = conf_realpath(conf_name) | |||
| config = file_utils.load_yaml_conf(config_path) | |||
| if not isinstance(config, dict): | |||
| raise ValueError(f'Invalid config file: "{config_path}".') | |||
| config.update(local_config) | |||
| return config.get(key, default) if key is not None else config | |||
| use_deserialize_safe_module = get_base_config( | |||
| 'use_deserialize_safe_module', False) | |||
| class CoordinationCommunicationProtocol(object): | |||
| HTTP = "http" | |||
| GRPC = "grpc" | |||
| class BaseType: | |||
| def to_dict(self): | |||
| return dict([(k.lstrip("_"), v) for k, v in self.__dict__.items()]) | |||
| def to_dict_with_type(self): | |||
| def _dict(obj): | |||
| module = None | |||
| if issubclass(obj.__class__, BaseType): | |||
| data = {} | |||
| for attr, v in obj.__dict__.items(): | |||
| k = attr.lstrip("_") | |||
| data[k] = _dict(v) | |||
| module = obj.__module__ | |||
| elif isinstance(obj, (list, tuple)): | |||
| data = [] | |||
| for i, vv in enumerate(obj): | |||
| data.append(_dict(vv)) | |||
| elif isinstance(obj, dict): | |||
| data = {} | |||
| for _k, vv in obj.items(): | |||
| data[_k] = _dict(vv) | |||
| else: | |||
| data = obj | |||
| return {"type": obj.__class__.__name__, | |||
| "data": data, "module": module} | |||
| return _dict(self) | |||
| class CustomJSONEncoder(json.JSONEncoder): | |||
| def __init__(self, **kwargs): | |||
| self._with_type = kwargs.pop("with_type", False) | |||
| super().__init__(**kwargs) | |||
| def default(self, obj): | |||
| if isinstance(obj, datetime.datetime): | |||
| return obj.strftime('%Y-%m-%d %H:%M:%S') | |||
| elif isinstance(obj, datetime.date): | |||
| return obj.strftime('%Y-%m-%d') | |||
| elif isinstance(obj, datetime.timedelta): | |||
| return str(obj) | |||
| elif issubclass(type(obj), Enum) or issubclass(type(obj), IntEnum): | |||
| return obj.value | |||
| elif isinstance(obj, set): | |||
| return list(obj) | |||
| elif issubclass(type(obj), BaseType): | |||
| if not self._with_type: | |||
| return obj.to_dict() | |||
| else: | |||
| return obj.to_dict_with_type() | |||
| elif isinstance(obj, type): | |||
| return obj.__name__ | |||
| else: | |||
| return json.JSONEncoder.default(self, obj) | |||
| def rag_uuid(): | |||
| return uuid.uuid1().hex | |||
| def string_to_bytes(string): | |||
| return string if isinstance( | |||
| string, bytes) else string.encode(encoding="utf-8") | |||
| def bytes_to_string(byte): | |||
| return byte.decode(encoding="utf-8") | |||
| def json_dumps(src, byte=False, indent=None, with_type=False): | |||
| dest = json.dumps( | |||
| src, | |||
| indent=indent, | |||
| cls=CustomJSONEncoder, | |||
| with_type=with_type) | |||
| if byte: | |||
| dest = string_to_bytes(dest) | |||
| return dest | |||
| def json_loads(src, object_hook=None, object_pairs_hook=None): | |||
| if isinstance(src, bytes): | |||
| src = bytes_to_string(src) | |||
| return json.loads(src, object_hook=object_hook, | |||
| object_pairs_hook=object_pairs_hook) | |||
| def current_timestamp(): | |||
| return int(time.time() * 1000) | |||
| def timestamp_to_date(timestamp, format_string="%Y-%m-%d %H:%M:%S"): | |||
| if not timestamp: | |||
| timestamp = time.time() | |||
| timestamp = int(timestamp) / 1000 | |||
| time_array = time.localtime(timestamp) | |||
| str_date = time.strftime(format_string, time_array) | |||
| return str_date | |||
| def date_string_to_timestamp(time_str, format_string="%Y-%m-%d %H:%M:%S"): | |||
| time_array = time.strptime(time_str, format_string) | |||
| time_stamp = int(time.mktime(time_array) * 1000) | |||
| return time_stamp | |||
| def serialize_b64(src, to_str=False): | |||
| dest = base64.b64encode(pickle.dumps(src)) | |||
| if not to_str: | |||
| return dest | |||
| else: | |||
| return bytes_to_string(dest) | |||
| def deserialize_b64(src): | |||
| src = base64.b64decode( | |||
| string_to_bytes(src) if isinstance( | |||
| src, str) else src) | |||
| if use_deserialize_safe_module: | |||
| return restricted_loads(src) | |||
| return pickle.loads(src) | |||
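| # Top-level modules whose classes may be unpickled when use_deserialize_safe_module is enabled. | |||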
| safe_module = { | |||
| 'numpy', | |||
| 'rag_flow' | |||
| } | |||
| class RestrictedUnpickler(pickle.Unpickler): | |||
| def find_class(self, module, name): | |||
| import importlib | |||
| if module.split('.')[0] in safe_module: | |||
| _module = importlib.import_module(module) | |||
| return getattr(_module, name) | |||
| # Forbid everything else. | |||
| raise pickle.UnpicklingError("global '%s.%s' is forbidden" % | |||
| (module, name)) | |||
| def restricted_loads(src): | |||
| """Helper function analogous to pickle.loads().""" | |||
| return RestrictedUnpickler(io.BytesIO(src)).load() | |||
| def get_lan_ip(): | |||
| if os.name != "nt": | |||
| import fcntl | |||
| import struct | |||
| def get_interface_ip(ifname): | |||
| s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) | |||
| return socket.inet_ntoa( | |||
| fcntl.ioctl(s.fileno(), 0x8915, struct.pack('256s', string_to_bytes(ifname[:15])))[20:24]) | |||
| ip = socket.gethostbyname(socket.getfqdn()) | |||
| if ip.startswith("127.") and os.name != "nt": | |||
| interfaces = [ | |||
| "bond1", | |||
| "eth0", | |||
| "eth1", | |||
| "eth2", | |||
| "wlan0", | |||
| "wlan1", | |||
| "wifi0", | |||
| "ath0", | |||
| "ath1", | |||
| "ppp0", | |||
| ] | |||
| for ifname in interfaces: | |||
| try: | |||
| ip = get_interface_ip(ifname) | |||
| break | |||
| except IOError as e: | |||
| pass | |||
| return ip or '' | |||
| def from_dict_hook(in_dict: dict): | |||
| if "type" in in_dict and "data" in in_dict: | |||
| if in_dict["module"] is None: | |||
| return in_dict["data"] | |||
| else: | |||
| return getattr(importlib.import_module( | |||
| in_dict["module"]), in_dict["type"])(**in_dict["data"]) | |||
| else: | |||
| return in_dict | |||
| def decrypt_database_password(password): | |||
| encrypt_password = get_base_config("encrypt_password", False) | |||
| encrypt_module = get_base_config("encrypt_module", False) | |||
| private_key = get_base_config("private_key", None) | |||
| if not password or not encrypt_password: | |||
| return password | |||
| if not private_key: | |||
| raise ValueError("No private key") | |||
| module_fun = encrypt_module.split("#") | |||
| pwdecrypt_fun = getattr( | |||
| importlib.import_module( | |||
| module_fun[0]), | |||
| module_fun[1]) | |||
| return pwdecrypt_fun(private_key, password) | |||
| def decrypt_database_config( | |||
| database=None, passwd_key="password", name="database"): | |||
| if not database: | |||
| database = get_base_config(name, {}) | |||
| database[passwd_key] = decrypt_database_password(database[passwd_key]) | |||
| return database | |||
| def update_config(key, value, conf_name=SERVICE_CONF): | |||
| conf_path = conf_realpath(conf_name=conf_name) | |||
| if not os.path.isabs(conf_path): | |||
| conf_path = os.path.join( | |||
| file_utils.get_project_base_directory(), conf_path) | |||
| with FileLock(os.path.join(os.path.dirname(conf_path), ".lock")): | |||
| config = file_utils.load_yaml_conf(conf_path=conf_path) or {} | |||
| config[key] = value | |||
| file_utils.rewrite_yaml_conf(conf_path=conf_path, config=config) | |||
| def get_uuid(): | |||
| return uuid.uuid1().hex | |||
| def datetime_format(date_time: datetime.datetime) -> datetime.datetime: | |||
| return datetime.datetime(date_time.year, date_time.month, date_time.day, | |||
| date_time.hour, date_time.minute, date_time.second) | |||
| def get_format_time() -> datetime.datetime: | |||
| return datetime_format(datetime.datetime.now()) | |||
| def str2date(date_time: str): | |||
| return datetime.datetime.strptime(date_time, '%Y-%m-%d') | |||
| def elapsed2time(elapsed): | |||
| seconds = elapsed / 1000 | |||
| minuter, second = divmod(seconds, 60) | |||
| hour, minuter = divmod(minuter, 60) | |||
| return '%02d:%02d:%02d' % (hour, minuter, second) | |||
| def decrypt(line): | |||
| file_path = os.path.join( | |||
| file_utils.get_project_base_directory(), | |||
| "conf", | |||
| "private.pem") | |||
| rsa_key = RSA.importKey(open(file_path).read(), "Welcome") | |||
| cipher = Cipher_pkcs1_v1_5.new(rsa_key) | |||
| return cipher.decrypt(base64.b64decode( | |||
| line), "Fail to decrypt password!").decode('utf-8') | |||
| def download_img(url): | |||
| if not url: | |||
| return "" | |||
| response = requests.get(url) | |||
| return "data:" + \ | |||
| response.headers.get('Content-Type', 'image/jpg') + ";" + \ | |||
| "base64," + base64.b64encode(response.content).decode("utf-8") | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import base64 | |||
| import datetime | |||
| import io | |||
| import json | |||
| import os | |||
| import pickle | |||
| import socket | |||
| import time | |||
| import uuid | |||
| import requests | |||
| from enum import Enum, IntEnum | |||
| import importlib | |||
| from Cryptodome.PublicKey import RSA | |||
| from Cryptodome.Cipher import PKCS1_v1_5 as Cipher_pkcs1_v1_5 | |||
| from filelock import FileLock | |||
| from . import file_utils | |||
| SERVICE_CONF = "service_conf.yaml" | |||
| def conf_realpath(conf_name): | |||
| conf_path = f"conf/{conf_name}" | |||
| return os.path.join(file_utils.get_project_base_directory(), conf_path) | |||
| def get_base_config(key, default=None, conf_name=SERVICE_CONF) -> dict: | |||
| local_config = {} | |||
| local_path = conf_realpath(f'local.{conf_name}') | |||
| if default is None: | |||
| default = os.environ.get(key.upper()) | |||
| if os.path.exists(local_path): | |||
| local_config = file_utils.load_yaml_conf(local_path) | |||
| if not isinstance(local_config, dict): | |||
| raise ValueError(f'Invalid config file: "{local_path}".') | |||
| if key is not None and key in local_config: | |||
| return local_config[key] | |||
| config_path = conf_realpath(conf_name) | |||
| config = file_utils.load_yaml_conf(config_path) | |||
| if not isinstance(config, dict): | |||
| raise ValueError(f'Invalid config file: "{config_path}".') | |||
| config.update(local_config) | |||
| return config.get(key, default) if key is not None else config | |||
| use_deserialize_safe_module = get_base_config( | |||
| 'use_deserialize_safe_module', False) | |||
| class CoordinationCommunicationProtocol(object): | |||
| HTTP = "http" | |||
| GRPC = "grpc" | |||
| class BaseType: | |||
| def to_dict(self): | |||
| return dict([(k.lstrip("_"), v) for k, v in self.__dict__.items()]) | |||
| def to_dict_with_type(self): | |||
| def _dict(obj): | |||
| module = None | |||
| if issubclass(obj.__class__, BaseType): | |||
| data = {} | |||
| for attr, v in obj.__dict__.items(): | |||
| k = attr.lstrip("_") | |||
| data[k] = _dict(v) | |||
| module = obj.__module__ | |||
| elif isinstance(obj, (list, tuple)): | |||
| data = [] | |||
| for i, vv in enumerate(obj): | |||
| data.append(_dict(vv)) | |||
| elif isinstance(obj, dict): | |||
| data = {} | |||
| for _k, vv in obj.items(): | |||
| data[_k] = _dict(vv) | |||
| else: | |||
| data = obj | |||
| return {"type": obj.__class__.__name__, | |||
| "data": data, "module": module} | |||
| return _dict(self) | |||
| class CustomJSONEncoder(json.JSONEncoder): | |||
| def __init__(self, **kwargs): | |||
| self._with_type = kwargs.pop("with_type", False) | |||
| super().__init__(**kwargs) | |||
| def default(self, obj): | |||
| if isinstance(obj, datetime.datetime): | |||
| return obj.strftime('%Y-%m-%d %H:%M:%S') | |||
| elif isinstance(obj, datetime.date): | |||
| return obj.strftime('%Y-%m-%d') | |||
| elif isinstance(obj, datetime.timedelta): | |||
| return str(obj) | |||
| elif issubclass(type(obj), Enum) or issubclass(type(obj), IntEnum): | |||
| return obj.value | |||
| elif isinstance(obj, set): | |||
| return list(obj) | |||
| elif issubclass(type(obj), BaseType): | |||
| if not self._with_type: | |||
| return obj.to_dict() | |||
| else: | |||
| return obj.to_dict_with_type() | |||
| elif isinstance(obj, type): | |||
| return obj.__name__ | |||
| else: | |||
| return json.JSONEncoder.default(self, obj) | |||
| def rag_uuid(): | |||
| return uuid.uuid1().hex | |||
| def string_to_bytes(string): | |||
| return string if isinstance( | |||
| string, bytes) else string.encode(encoding="utf-8") | |||
| def bytes_to_string(byte): | |||
| return byte.decode(encoding="utf-8") | |||
| def json_dumps(src, byte=False, indent=None, with_type=False): | |||
| dest = json.dumps( | |||
| src, | |||
| indent=indent, | |||
| cls=CustomJSONEncoder, | |||
| with_type=with_type) | |||
| if byte: | |||
| dest = string_to_bytes(dest) | |||
| return dest | |||
| def json_loads(src, object_hook=None, object_pairs_hook=None): | |||
| if isinstance(src, bytes): | |||
| src = bytes_to_string(src) | |||
| return json.loads(src, object_hook=object_hook, | |||
| object_pairs_hook=object_pairs_hook) | |||
| def current_timestamp(): | |||
| return int(time.time() * 1000) | |||
| def timestamp_to_date(timestamp, format_string="%Y-%m-%d %H:%M:%S"): | |||
| if not timestamp: | |||
| timestamp = time.time() | |||
| timestamp = int(timestamp) / 1000 | |||
| time_array = time.localtime(timestamp) | |||
| str_date = time.strftime(format_string, time_array) | |||
| return str_date | |||
| def date_string_to_timestamp(time_str, format_string="%Y-%m-%d %H:%M:%S"): | |||
| time_array = time.strptime(time_str, format_string) | |||
| time_stamp = int(time.mktime(time_array) * 1000) | |||
| return time_stamp | |||
| def serialize_b64(src, to_str=False): | |||
| dest = base64.b64encode(pickle.dumps(src)) | |||
| if not to_str: | |||
| return dest | |||
| else: | |||
| return bytes_to_string(dest) | |||
| def deserialize_b64(src): | |||
| src = base64.b64decode( | |||
| string_to_bytes(src) if isinstance( | |||
| src, str) else src) | |||
| if use_deserialize_safe_module: | |||
| return restricted_loads(src) | |||
| return pickle.loads(src) | |||
| safe_module = { | |||
| 'numpy', | |||
| 'rag_flow' | |||
| } | |||
| class RestrictedUnpickler(pickle.Unpickler): | |||
| def find_class(self, module, name): | |||
| import importlib | |||
| if module.split('.')[0] in safe_module: | |||
| _module = importlib.import_module(module) | |||
| return getattr(_module, name) | |||
| # Forbid everything else. | |||
| raise pickle.UnpicklingError("global '%s.%s' is forbidden" % | |||
| (module, name)) | |||
| def restricted_loads(src): | |||
| """Helper function analogous to pickle.loads().""" | |||
| return RestrictedUnpickler(io.BytesIO(src)).load() | |||
| def get_lan_ip(): | |||
| if os.name != "nt": | |||
| import fcntl | |||
| import struct | |||
| def get_interface_ip(ifname): | |||
| s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) | |||
| return socket.inet_ntoa( | |||
| fcntl.ioctl(s.fileno(), 0x8915, struct.pack('256s', string_to_bytes(ifname[:15])))[20:24]) | |||
| ip = socket.gethostbyname(socket.getfqdn()) | |||
| if ip.startswith("127.") and os.name != "nt": | |||
| interfaces = [ | |||
| "bond1", | |||
| "eth0", | |||
| "eth1", | |||
| "eth2", | |||
| "wlan0", | |||
| "wlan1", | |||
| "wifi0", | |||
| "ath0", | |||
| "ath1", | |||
| "ppp0", | |||
| ] | |||
| for ifname in interfaces: | |||
| try: | |||
| ip = get_interface_ip(ifname) | |||
| break | |||
| except IOError as e: | |||
| pass | |||
| return ip or '' | |||
| def from_dict_hook(in_dict: dict): | |||
| if "type" in in_dict and "data" in in_dict: | |||
| if in_dict["module"] is None: | |||
| return in_dict["data"] | |||
| else: | |||
| return getattr(importlib.import_module( | |||
| in_dict["module"]), in_dict["type"])(**in_dict["data"]) | |||
| else: | |||
| return in_dict | |||
| def decrypt_database_password(password): | |||
| encrypt_password = get_base_config("encrypt_password", False) | |||
| encrypt_module = get_base_config("encrypt_module", False) | |||
| private_key = get_base_config("private_key", None) | |||
| if not password or not encrypt_password: | |||
| return password | |||
| if not private_key: | |||
| raise ValueError("No private key") | |||
| module_fun = encrypt_module.split("#") | |||
| pwdecrypt_fun = getattr( | |||
| importlib.import_module( | |||
| module_fun[0]), | |||
| module_fun[1]) | |||
| return pwdecrypt_fun(private_key, password) | |||
| def decrypt_database_config( | |||
| database=None, passwd_key="password", name="database"): | |||
| if not database: | |||
| database = get_base_config(name, {}) | |||
| database[passwd_key] = decrypt_database_password(database[passwd_key]) | |||
| return database | |||
| def update_config(key, value, conf_name=SERVICE_CONF): | |||
| conf_path = conf_realpath(conf_name=conf_name) | |||
| if not os.path.isabs(conf_path): | |||
| conf_path = os.path.join( | |||
| file_utils.get_project_base_directory(), conf_path) | |||
| with FileLock(os.path.join(os.path.dirname(conf_path), ".lock")): | |||
| config = file_utils.load_yaml_conf(conf_path=conf_path) or {} | |||
| config[key] = value | |||
| file_utils.rewrite_yaml_conf(conf_path=conf_path, config=config) | |||
| def get_uuid(): | |||
| return uuid.uuid1().hex | |||
| def datetime_format(date_time: datetime.datetime) -> datetime.datetime: | |||
| return datetime.datetime(date_time.year, date_time.month, date_time.day, | |||
| date_time.hour, date_time.minute, date_time.second) | |||
| def get_format_time() -> datetime.datetime: | |||
| return datetime_format(datetime.datetime.now()) | |||
| def str2date(date_time: str): | |||
| return datetime.datetime.strptime(date_time, '%Y-%m-%d') | |||
| def elapsed2time(elapsed): | |||
| seconds = elapsed / 1000 | |||
| minuter, second = divmod(seconds, 60) | |||
| hour, minuter = divmod(minuter, 60) | |||
| return '%02d:%02d:%02d' % (hour, minuter, second) | |||
| def decrypt(line): | |||
| file_path = os.path.join( | |||
| file_utils.get_project_base_directory(), | |||
| "conf", | |||
| "private.pem") | |||
| rsa_key = RSA.importKey(open(file_path).read(), "Welcome") | |||
| cipher = Cipher_pkcs1_v1_5.new(rsa_key) | |||
| return cipher.decrypt(base64.b64decode( | |||
| line), "Fail to decrypt password!").decode('utf-8') | |||
| def download_img(url): | |||
| if not url: | |||
| return "" | |||
| response = requests.get(url) | |||
| return "data:" + \ | |||
| response.headers.get('Content-Type', 'image/jpg') + ";" + \ | |||
| "base64," + base64.b64encode(response.content).decode("utf-8") | |||
| @@ -1,269 +1,269 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import json | |||
| import random | |||
| import time | |||
| from functools import wraps | |||
| from io import BytesIO | |||
| from flask import ( | |||
| Response, jsonify, send_file, make_response, | |||
| request as flask_request, | |||
| ) | |||
| from werkzeug.http import HTTP_STATUS_CODES | |||
| from api.utils import json_dumps | |||
| from api.settings import RetCode | |||
| from api.settings import ( | |||
| REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC, | |||
| stat_logger, CLIENT_AUTHENTICATION, HTTP_APP_KEY, SECRET_KEY | |||
| ) | |||
| import requests | |||
| import functools | |||
| from api.utils import CustomJSONEncoder | |||
| from uuid import uuid1 | |||
| from base64 import b64encode | |||
| from hmac import HMAC | |||
| from urllib.parse import quote, urlencode | |||
| requests.models.complexjson.dumps = functools.partial( | |||
| json.dumps, cls=CustomJSONEncoder) | |||
| def request(**kwargs): | |||
| sess = requests.Session() | |||
| stream = kwargs.pop('stream', sess.stream) | |||
| timeout = kwargs.pop('timeout', None) | |||
| kwargs['headers'] = { | |||
| k.replace( | |||
| '_', | |||
| '-').upper(): v for k, | |||
| v in kwargs.get( | |||
| 'headers', | |||
| {}).items()} | |||
| prepped = requests.Request(**kwargs).prepare() | |||
| if CLIENT_AUTHENTICATION and HTTP_APP_KEY and SECRET_KEY: | |||
| timestamp = str(round(time.time() * 1000)) | |||
| nonce = str(uuid1()) | |||
| signature = b64encode(HMAC(SECRET_KEY.encode('ascii'), b'\n'.join([ | |||
| timestamp.encode('ascii'), | |||
| nonce.encode('ascii'), | |||
| HTTP_APP_KEY.encode('ascii'), | |||
| prepped.path_url.encode('ascii'), | |||
| prepped.body if kwargs.get('json') else b'', | |||
| urlencode( | |||
| sorted( | |||
| kwargs['data'].items()), | |||
| quote_via=quote, | |||
| safe='-._~').encode('ascii') | |||
| if kwargs.get('data') and isinstance(kwargs['data'], dict) else b'', | |||
| ]), 'sha1').digest()).decode('ascii') | |||
| prepped.headers.update({ | |||
| 'TIMESTAMP': timestamp, | |||
| 'NONCE': nonce, | |||
| 'APP-KEY': HTTP_APP_KEY, | |||
| 'SIGNATURE': signature, | |||
| }) | |||
| return sess.send(prepped, stream=stream, timeout=timeout) | |||
| def get_exponential_backoff_interval(retries, full_jitter=False): | |||
| """Calculate the exponential backoff wait time.""" | |||
| # Will be zero if factor equals 0 | |||
| countdown = min(REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC * (2 ** retries)) | |||
| # Full jitter according to | |||
| # https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ | |||
| if full_jitter: | |||
| countdown = random.randrange(countdown + 1) | |||
| # Adjust according to maximum wait time and account for negative values. | |||
| return max(0, countdown) | |||
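A quick sanity check of the backoff curve. The constant values below are assumptions for illustration; the real `REQUEST_WAIT_SEC` and `REQUEST_MAX_WAIT_SEC` come from `api.settings`.

```python
# Stand-in values; the actual constants are imported from api.settings above.
REQUEST_WAIT_SEC, REQUEST_MAX_WAIT_SEC = 2, 300

def backoff(retries: int) -> int:
    # Same formula as get_exponential_backoff_interval without jitter.
    return min(REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC * (2 ** retries))

print([backoff(r) for r in range(10)])
# [2, 4, 8, 16, 32, 64, 128, 256, 300, 300] -- doubles per retry, capped at the max.
# With full_jitter=True the function returns random.randrange(countdown + 1) instead,
# i.e. a uniform value in [0, countdown] that spreads simultaneous retries apart.
```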
| def get_json_result(retcode=RetCode.SUCCESS, retmsg='success', | |||
| data=None, job_id=None, meta=None): | |||
| import re | |||
| result_dict = { | |||
| "retcode": retcode, | |||
| "retmsg": retmsg, | |||
| # "retmsg": re.sub(r"rag", "seceum", retmsg, flags=re.IGNORECASE), | |||
| "data": data, | |||
| "jobId": job_id, | |||
| "meta": meta, | |||
| } | |||
| response = {} | |||
| for key, value in result_dict.items(): | |||
| if value is None and key != "retcode": | |||
| continue | |||
| else: | |||
| response[key] = value | |||
| return jsonify(response) | |||
| def get_data_error_result(retcode=RetCode.DATA_ERROR, | |||
| retmsg='Sorry! Data missing!'): | |||
| import re | |||
| result_dict = { | |||
| "retcode": retcode, | |||
| "retmsg": re.sub( | |||
| r"rag", | |||
| "seceum", | |||
| retmsg, | |||
| flags=re.IGNORECASE)} | |||
| response = {} | |||
| for key, value in result_dict.items(): | |||
| if value is None and key != "retcode": | |||
| continue | |||
| else: | |||
| response[key] = value | |||
| return jsonify(response) | |||
| def server_error_response(e): | |||
| stat_logger.exception(e) | |||
| try: | |||
| if e.code == 401: | |||
| return get_json_result(retcode=401, retmsg=repr(e)) | |||
| except BaseException: | |||
| pass | |||
| if len(e.args) > 1: | |||
| return get_json_result( | |||
| retcode=RetCode.EXCEPTION_ERROR, retmsg=repr(e.args[0]), data=e.args[1]) | |||
| if repr(e).find("index_not_found_exception") >= 0: | |||
| return get_json_result(retcode=RetCode.EXCEPTION_ERROR, retmsg="No chunk found, please upload file and parse it.") | |||
| return get_json_result(retcode=RetCode.EXCEPTION_ERROR, retmsg=repr(e)) | |||
| def error_response(response_code, retmsg=None): | |||
| if retmsg is None: | |||
| retmsg = HTTP_STATUS_CODES.get(response_code, 'Unknown Error') | |||
| return Response(json.dumps({ | |||
| 'retmsg': retmsg, | |||
| 'retcode': response_code, | |||
| }), status=response_code, mimetype='application/json') | |||
| def validate_request(*args, **kwargs): | |||
| def wrapper(func): | |||
| @wraps(func) | |||
| def decorated_function(*_args, **_kwargs): | |||
| input_arguments = flask_request.json or flask_request.form.to_dict() | |||
| no_arguments = [] | |||
| error_arguments = [] | |||
| for arg in args: | |||
| if arg not in input_arguments: | |||
| no_arguments.append(arg) | |||
| for k, v in kwargs.items(): | |||
| config_value = input_arguments.get(k, None) | |||
| if config_value is None: | |||
| no_arguments.append(k) | |||
| elif isinstance(v, (tuple, list)): | |||
| if config_value not in v: | |||
| error_arguments.append((k, set(v))) | |||
| elif config_value != v: | |||
| error_arguments.append((k, v)) | |||
| if no_arguments or error_arguments: | |||
| error_string = "" | |||
| if no_arguments: | |||
| error_string += "required argument are missing: {}; ".format( | |||
| ",".join(no_arguments)) | |||
| if error_arguments: | |||
| error_string += "required argument values: {}".format( | |||
| ",".join(["{}={}".format(a[0], a[1]) for a in error_arguments])) | |||
| return get_json_result( | |||
| retcode=RetCode.ARGUMENT_ERROR, retmsg=error_string) | |||
| return func(*_args, **_kwargs) | |||
| return decorated_function | |||
| return wrapper | |||
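A hypothetical route showing how `validate_request` is typically stacked on a Flask view. The blueprint, URL and field names are placeholders rather than code from this PR; `flask_request` and `get_json_result` are the names defined in this module.

```python
from flask import Blueprint

manager = Blueprint("demo", __name__)

@manager.route("/dialog/set", methods=["POST"])
@validate_request("dialog_id", "name")   # both keys must appear in the JSON body or form
def set_dialog():
    req = flask_request.json
    return get_json_result(data={"dialog_id": req["dialog_id"], "name": req["name"]})

# A request missing either key never reaches set_dialog(); the decorator answers with
# retcode=RetCode.ARGUMENT_ERROR and a "required arguments are missing: ..." message.
```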
| def is_localhost(ip): | |||
| return ip in {'127.0.0.1', '::1', '[::1]', 'localhost'} | |||
| def send_file_in_mem(data, filename): | |||
| if not isinstance(data, (str, bytes)): | |||
| data = json_dumps(data) | |||
| if isinstance(data, str): | |||
| data = data.encode('utf-8') | |||
| f = BytesIO() | |||
| f.write(data) | |||
| f.seek(0) | |||
| return send_file(f, as_attachment=True, attachment_filename=filename) | |||
| def get_json_result(retcode=RetCode.SUCCESS, retmsg='success', data=None): | |||
| response = {"retcode": retcode, "retmsg": retmsg, "data": data} | |||
| return jsonify(response) | |||
| def cors_reponse(retcode=RetCode.SUCCESS, | |||
| retmsg='success', data=None, auth=None): | |||
| result_dict = {"retcode": retcode, "retmsg": retmsg, "data": data} | |||
| response_dict = {} | |||
| for key, value in result_dict.items(): | |||
| if value is None and key != "retcode": | |||
| continue | |||
| else: | |||
| response_dict[key] = value | |||
| response = make_response(jsonify(response_dict)) | |||
| if auth: | |||
| response.headers["Authorization"] = auth | |||
| response.headers["Access-Control-Allow-Origin"] = "*" | |||
| response.headers["Access-Control-Allow-Method"] = "*" | |||
| response.headers["Access-Control-Allow-Headers"] = "*" | |||
| response.headers["Access-Control-Allow-Headers"] = "*" | |||
| response.headers["Access-Control-Expose-Headers"] = "Authorization" | |||
| return response | |||
| def construct_result(code=RetCode.DATA_ERROR, message='data is missing'): | |||
| import re | |||
| result_dict = {"code": code, "message": re.sub(r"rag", "seceum", message, flags=re.IGNORECASE)} | |||
| response = {} | |||
| for key, value in result_dict.items(): | |||
| if value is None and key != "code": | |||
| continue | |||
| else: | |||
| response[key] = value | |||
| return jsonify(response) | |||
| def construct_json_result(code=RetCode.SUCCESS, message='success', data=None): | |||
| if data is None: | |||
| return jsonify({"code": code, "message": message}) | |||
| else: | |||
| return jsonify({"code": code, "message": message, "data": data}) | |||
| def construct_error_response(e): | |||
| stat_logger.exception(e) | |||
| try: | |||
| if e.code == 401: | |||
| return construct_json_result(code=RetCode.UNAUTHORIZED, message=repr(e)) | |||
| except BaseException: | |||
| pass | |||
| if len(e.args) > 1: | |||
| return construct_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1]) | |||
| if repr(e).find("index_not_found_exception") >=0: | |||
| return construct_json_result(code=RetCode.EXCEPTION_ERROR, message="No chunk found, please upload file and parse it.") | |||
| return construct_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e)) | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import json | |||
| import random | |||
| import time | |||
| from functools import wraps | |||
| from io import BytesIO | |||
| from flask import ( | |||
| Response, jsonify, send_file, make_response, | |||
| request as flask_request, | |||
| ) | |||
| from werkzeug.http import HTTP_STATUS_CODES | |||
| from api.utils import json_dumps | |||
| from api.settings import RetCode | |||
| from api.settings import ( | |||
| REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC, | |||
| stat_logger, CLIENT_AUTHENTICATION, HTTP_APP_KEY, SECRET_KEY | |||
| ) | |||
| import requests | |||
| import functools | |||
| from api.utils import CustomJSONEncoder | |||
| from uuid import uuid1 | |||
| from base64 import b64encode | |||
| from hmac import HMAC | |||
| from urllib.parse import quote, urlencode | |||
| requests.models.complexjson.dumps = functools.partial( | |||
| json.dumps, cls=CustomJSONEncoder) | |||
| def request(**kwargs): | |||
| sess = requests.Session() | |||
| stream = kwargs.pop('stream', sess.stream) | |||
| timeout = kwargs.pop('timeout', None) | |||
| kwargs['headers'] = { | |||
| k.replace( | |||
| '_', | |||
| '-').upper(): v for k, | |||
| v in kwargs.get( | |||
| 'headers', | |||
| {}).items()} | |||
| prepped = requests.Request(**kwargs).prepare() | |||
| if CLIENT_AUTHENTICATION and HTTP_APP_KEY and SECRET_KEY: | |||
| timestamp = str(round(time.time() * 1000)) | |||
| nonce = str(uuid1()) | |||
| signature = b64encode(HMAC(SECRET_KEY.encode('ascii'), b'\n'.join([ | |||
| timestamp.encode('ascii'), | |||
| nonce.encode('ascii'), | |||
| HTTP_APP_KEY.encode('ascii'), | |||
| prepped.path_url.encode('ascii'), | |||
| prepped.body if kwargs.get('json') else b'', | |||
| urlencode( | |||
| sorted( | |||
| kwargs['data'].items()), | |||
| quote_via=quote, | |||
| safe='-._~').encode('ascii') | |||
| if kwargs.get('data') and isinstance(kwargs['data'], dict) else b'', | |||
| ]), 'sha1').digest()).decode('ascii') | |||
| prepped.headers.update({ | |||
| 'TIMESTAMP': timestamp, | |||
| 'NONCE': nonce, | |||
| 'APP-KEY': HTTP_APP_KEY, | |||
| 'SIGNATURE': signature, | |||
| }) | |||
| return sess.send(prepped, stream=stream, timeout=timeout) | |||
| def get_exponential_backoff_interval(retries, full_jitter=False): | |||
| """Calculate the exponential backoff wait time.""" | |||
| # Will be zero if factor equals 0 | |||
| countdown = min(REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC * (2 ** retries)) | |||
| # Full jitter according to | |||
| # https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ | |||
| if full_jitter: | |||
| countdown = random.randrange(countdown + 1) | |||
| # Adjust according to maximum wait time and account for negative values. | |||
| return max(0, countdown) | |||
| def get_json_result(retcode=RetCode.SUCCESS, retmsg='success', | |||
| data=None, job_id=None, meta=None): | |||
| import re | |||
| result_dict = { | |||
| "retcode": retcode, | |||
| "retmsg": retmsg, | |||
| # "retmsg": re.sub(r"rag", "seceum", retmsg, flags=re.IGNORECASE), | |||
| "data": data, | |||
| "jobId": job_id, | |||
| "meta": meta, | |||
| } | |||
| response = {} | |||
| for key, value in result_dict.items(): | |||
| if value is None and key != "retcode": | |||
| continue | |||
| else: | |||
| response[key] = value | |||
| return jsonify(response) | |||
| def get_data_error_result(retcode=RetCode.DATA_ERROR, | |||
| retmsg='Sorry! Data missing!'): | |||
| import re | |||
| result_dict = { | |||
| "retcode": retcode, | |||
| "retmsg": re.sub( | |||
| r"rag", | |||
| "seceum", | |||
| retmsg, | |||
| flags=re.IGNORECASE)} | |||
| response = {} | |||
| for key, value in result_dict.items(): | |||
| if value is None and key != "retcode": | |||
| continue | |||
| else: | |||
| response[key] = value | |||
| return jsonify(response) | |||
| def server_error_response(e): | |||
| stat_logger.exception(e) | |||
| try: | |||
| if e.code == 401: | |||
| return get_json_result(retcode=401, retmsg=repr(e)) | |||
| except BaseException: | |||
| pass | |||
| if len(e.args) > 1: | |||
| return get_json_result( | |||
| retcode=RetCode.EXCEPTION_ERROR, retmsg=repr(e.args[0]), data=e.args[1]) | |||
| if repr(e).find("index_not_found_exception") >= 0: | |||
| return get_json_result(retcode=RetCode.EXCEPTION_ERROR, retmsg="No chunk found, please upload file and parse it.") | |||
| return get_json_result(retcode=RetCode.EXCEPTION_ERROR, retmsg=repr(e)) | |||
| def error_response(response_code, retmsg=None): | |||
| if retmsg is None: | |||
| retmsg = HTTP_STATUS_CODES.get(response_code, 'Unknown Error') | |||
| return Response(json.dumps({ | |||
| 'retmsg': retmsg, | |||
| 'retcode': response_code, | |||
| }), status=response_code, mimetype='application/json') | |||
| def validate_request(*args, **kwargs): | |||
| def wrapper(func): | |||
| @wraps(func) | |||
| def decorated_function(*_args, **_kwargs): | |||
| input_arguments = flask_request.json or flask_request.form.to_dict() | |||
| no_arguments = [] | |||
| error_arguments = [] | |||
| for arg in args: | |||
| if arg not in input_arguments: | |||
| no_arguments.append(arg) | |||
| for k, v in kwargs.items(): | |||
| config_value = input_arguments.get(k, None) | |||
| if config_value is None: | |||
| no_arguments.append(k) | |||
| elif isinstance(v, (tuple, list)): | |||
| if config_value not in v: | |||
| error_arguments.append((k, set(v))) | |||
| elif config_value != v: | |||
| error_arguments.append((k, v)) | |||
| if no_arguments or error_arguments: | |||
| error_string = "" | |||
| if no_arguments: | |||
| error_string += "required argument are missing: {}; ".format( | |||
| ",".join(no_arguments)) | |||
| if error_arguments: | |||
| error_string += "required argument values: {}".format( | |||
| ",".join(["{}={}".format(a[0], a[1]) for a in error_arguments])) | |||
| return get_json_result( | |||
| retcode=RetCode.ARGUMENT_ERROR, retmsg=error_string) | |||
| return func(*_args, **_kwargs) | |||
| return decorated_function | |||
| return wrapper | |||
| def is_localhost(ip): | |||
| return ip in {'127.0.0.1', '::1', '[::1]', 'localhost'} | |||
| def send_file_in_mem(data, filename): | |||
| if not isinstance(data, (str, bytes)): | |||
| data = json_dumps(data) | |||
| if isinstance(data, str): | |||
| data = data.encode('utf-8') | |||
| f = BytesIO() | |||
| f.write(data) | |||
| f.seek(0) | |||
| return send_file(f, as_attachment=True, attachment_filename=filename) | |||
| def get_json_result(retcode=RetCode.SUCCESS, retmsg='success', data=None): | |||
| response = {"retcode": retcode, "retmsg": retmsg, "data": data} | |||
| return jsonify(response) | |||
| def cors_reponse(retcode=RetCode.SUCCESS, | |||
| retmsg='success', data=None, auth=None): | |||
| result_dict = {"retcode": retcode, "retmsg": retmsg, "data": data} | |||
| response_dict = {} | |||
| for key, value in result_dict.items(): | |||
| if value is None and key != "retcode": | |||
| continue | |||
| else: | |||
| response_dict[key] = value | |||
| response = make_response(jsonify(response_dict)) | |||
| if auth: | |||
| response.headers["Authorization"] = auth | |||
| response.headers["Access-Control-Allow-Origin"] = "*" | |||
| response.headers["Access-Control-Allow-Method"] = "*" | |||
| response.headers["Access-Control-Allow-Headers"] = "*" | |||
| response.headers["Access-Control-Allow-Headers"] = "*" | |||
| response.headers["Access-Control-Expose-Headers"] = "Authorization" | |||
| return response | |||
| def construct_result(code=RetCode.DATA_ERROR, message='data is missing'): | |||
| import re | |||
| result_dict = {"code": code, "message": re.sub(r"rag", "seceum", message, flags=re.IGNORECASE)} | |||
| response = {} | |||
| for key, value in result_dict.items(): | |||
| if value is None and key != "code": | |||
| continue | |||
| else: | |||
| response[key] = value | |||
| return jsonify(response) | |||
| def construct_json_result(code=RetCode.SUCCESS, message='success', data=None): | |||
| if data is None: | |||
| return jsonify({"code": code, "message": message}) | |||
| else: | |||
| return jsonify({"code": code, "message": message, "data": data}) | |||
| def construct_error_response(e): | |||
| stat_logger.exception(e) | |||
| try: | |||
| if e.code == 401: | |||
| return construct_json_result(code=RetCode.UNAUTHORIZED, message=repr(e)) | |||
| except BaseException: | |||
| pass | |||
| if len(e.args) > 1: | |||
| return construct_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1]) | |||
| if repr(e).find("index_not_found_exception") >=0: | |||
| return construct_json_result(code=RetCode.EXCEPTION_ERROR, message="No chunk found, please upload file and parse it.") | |||
| return construct_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e)) | |||
| @@ -1,78 +1,78 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import base64 | |||
| import click | |||
| import re | |||
| from flask import Flask | |||
| from werkzeug.security import generate_password_hash | |||
| from api.db.services import UserService | |||
| @click.command('reset-password', help='Reset the account password.') | |||
| @click.option('--email', prompt=True, help='The email address of the account whose password you need to reset') | |||
| @click.option('--new-password', prompt=True, help='the new password.') | |||
| @click.option('--password-confirm', prompt=True, help='the new password confirm.') | |||
| def reset_password(email, new_password, password_confirm): | |||
| if str(new_password).strip() != str(password_confirm).strip(): | |||
| click.echo(click.style('Sorry, the two passwords do not match.', fg='red')) | |||
| return | |||
| user = UserService.query(email=email) | |||
| if not user: | |||
| click.echo(click.style('Sorry, this email is not registered.', fg='red')) | |||
| return | |||
| encode_password = base64.b64encode(new_password.encode('utf-8')).decode('utf-8') | |||
| password_hash = generate_password_hash(encode_password) | |||
| user_dict = { | |||
| 'password': password_hash | |||
| } | |||
| UserService.update_user(user[0].id, user_dict) | |||
| click.echo(click.style('Congratulations! Password has been reset.', fg='green')) | |||
| @click.command('reset-email', help='Reset the account email.') | |||
| @click.option('--email', prompt=True, help='The old email address of the account whose email you need to reset') | |||
| @click.option('--new-email', prompt=True, help='the new email.') | |||
| @click.option('--email-confirm', prompt=True, help='the new email confirm.') | |||
| def reset_email(email, new_email, email_confirm): | |||
| if str(new_email).strip() != str(email_confirm).strip(): | |||
| click.echo(click.style('Sorry, new email and confirm email do not match.', fg='red')) | |||
| return | |||
| if str(new_email).strip() == str(email).strip(): | |||
| click.echo(click.style('Sorry, new email and old email are the same.', fg='red')) | |||
| return | |||
| user = UserService.query(email=email) | |||
| if not user: | |||
| click.echo(click.style('Sorry, the account [{}] does not exist.'.format(email), fg='red')) | |||
| return | |||
| if not re.match(r"^[\w\._-]+@([\w_-]+\.)+[\w-]{2,4}$", new_email): | |||
| click.echo(click.style('Sorry, {} is not a valid email address.'.format(new_email), fg='red')) | |||
| return | |||
| new_user = UserService.query(email=new_email) | |||
| if new_user: | |||
| click.echo(click.style('Sorry, the account [{}] already exists.'.format(new_email), fg='red')) | |||
| return | |||
| user_dict = { | |||
| 'email': new_email | |||
| } | |||
| UserService.update_user(user[0].id, user_dict) | |||
| click.echo(click.style('Congratulations! Email has been reset.', fg='green')) | |||
| def register_commands(app: Flask): | |||
| app.cli.add_command(reset_password) | |||
| app.cli.add_command(reset_email) | |||
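For context, the two commands above only become available once `register_commands` is called on the Flask app; the module path in the comment below is an assumption, not something defined in this PR.

```python
from flask import Flask

app = Flask(__name__)
register_commands(app)  # adds the `reset-password` and `reset-email` sub-commands

# Then, from a shell (with FLASK_APP pointing at whichever module creates the app):
#   FLASK_APP=<your_app_module> flask reset-password \
#       --email user@example.com --new-password secret --password-confirm secret
```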
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import base64 | |||
| import click | |||
| import re | |||
| from flask import Flask | |||
| from werkzeug.security import generate_password_hash | |||
| from api.db.services import UserService | |||
| @click.command('reset-password', help='Reset the account password.') | |||
| @click.option('--email', prompt=True, help='The email address of the account whose password you need to reset') | |||
| @click.option('--new-password', prompt=True, help='the new password.') | |||
| @click.option('--password-confirm', prompt=True, help='the new password confirm.') | |||
| def reset_password(email, new_password, password_confirm): | |||
| if str(new_password).strip() != str(password_confirm).strip(): | |||
| click.echo(click.style('Sorry, the two passwords do not match.', fg='red')) | |||
| return | |||
| user = UserService.query(email=email) | |||
| if not user: | |||
| click.echo(click.style('Sorry, this email is not registered.', fg='red')) | |||
| return | |||
| encode_password = base64.b64encode(new_password.encode('utf-8')).decode('utf-8') | |||
| password_hash = generate_password_hash(encode_password) | |||
| user_dict = { | |||
| 'password': password_hash | |||
| } | |||
| UserService.update_user(user[0].id, user_dict) | |||
| click.echo(click.style('Congratulations! Password has been reset.', fg='green')) | |||
| @click.command('reset-email', help='Reset the account email.') | |||
| @click.option('--email', prompt=True, help='The old email address of the account whose email you need to reset') | |||
| @click.option('--new-email', prompt=True, help='the new email.') | |||
| @click.option('--email-confirm', prompt=True, help='the new email confirm.') | |||
| def reset_email(email, new_email, email_confirm): | |||
| if str(new_email).strip() != str(email_confirm).strip(): | |||
| click.echo(click.style('Sorry, new email and confirm email do not match.', fg='red')) | |||
| return | |||
| if str(new_email).strip() == str(email).strip(): | |||
| click.echo(click.style('Sorry, new email and old email are the same.', fg='red')) | |||
| return | |||
| user = UserService.query(email=email) | |||
| if not user: | |||
| click.echo(click.style('Sorry, the account [{}] does not exist.'.format(email), fg='red')) | |||
| return | |||
| if not re.match(r"^[\w\._-]+@([\w_-]+\.)+[\w-]{2,4}$", new_email): | |||
| click.echo(click.style('Sorry, {} is not a valid email address.'.format(new_email), fg='red')) | |||
| return | |||
| new_user = UserService.query(email=new_email) | |||
| if new_user: | |||
| click.echo(click.style('Sorry, the account [{}] already exists.'.format(new_email), fg='red')) | |||
| return | |||
| user_dict = { | |||
| 'email': new_email | |||
| } | |||
| UserService.update_user(user[0].id, user_dict) | |||
| click.echo(click.style('Congratulations! Email has been reset.', fg='green')) | |||
| def register_commands(app: Flask): | |||
| app.cli.add_command(reset_password) | |||
| app.cli.add_command(reset_email) | |||
| @@ -1,207 +1,207 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import base64 | |||
| import json | |||
| import os | |||
| import re | |||
| from io import BytesIO | |||
| import pdfplumber | |||
| from PIL import Image | |||
| from cachetools import LRUCache, cached | |||
| from ruamel.yaml import YAML | |||
| from api.db import FileType | |||
| PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE") | |||
| RAG_BASE = os.getenv("RAG_BASE") | |||
| def get_project_base_directory(*args): | |||
| global PROJECT_BASE | |||
| if PROJECT_BASE is None: | |||
| PROJECT_BASE = os.path.abspath( | |||
| os.path.join( | |||
| os.path.dirname(os.path.realpath(__file__)), | |||
| os.pardir, | |||
| os.pardir, | |||
| ) | |||
| ) | |||
| if args: | |||
| return os.path.join(PROJECT_BASE, *args) | |||
| return PROJECT_BASE | |||
| def get_rag_directory(*args): | |||
| global RAG_BASE | |||
| if RAG_BASE is None: | |||
| RAG_BASE = os.path.abspath( | |||
| os.path.join( | |||
| os.path.dirname(os.path.realpath(__file__)), | |||
| os.pardir, | |||
| os.pardir, | |||
| os.pardir, | |||
| ) | |||
| ) | |||
| if args: | |||
| return os.path.join(RAG_BASE, *args) | |||
| return RAG_BASE | |||
| def get_rag_python_directory(*args): | |||
| return get_rag_directory("python", *args) | |||
| def get_home_cache_dir(): | |||
| dir = os.path.join(os.path.expanduser('~'), ".ragflow") | |||
| try: | |||
| os.mkdir(dir) | |||
| except OSError as error: | |||
| pass | |||
| return dir | |||
| @cached(cache=LRUCache(maxsize=10)) | |||
| def load_json_conf(conf_path): | |||
| if os.path.isabs(conf_path): | |||
| json_conf_path = conf_path | |||
| else: | |||
| json_conf_path = os.path.join(get_project_base_directory(), conf_path) | |||
| try: | |||
| with open(json_conf_path) as f: | |||
| return json.load(f) | |||
| except BaseException: | |||
| raise EnvironmentError( | |||
| "loading json file config from '{}' failed!".format(json_conf_path) | |||
| ) | |||
| def dump_json_conf(config_data, conf_path): | |||
| if os.path.isabs(conf_path): | |||
| json_conf_path = conf_path | |||
| else: | |||
| json_conf_path = os.path.join(get_project_base_directory(), conf_path) | |||
| try: | |||
| with open(json_conf_path, "w") as f: | |||
| json.dump(config_data, f, indent=4) | |||
| except BaseException: | |||
| raise EnvironmentError( | |||
| "loading json file config from '{}' failed!".format(json_conf_path) | |||
| ) | |||
| def load_json_conf_real_time(conf_path): | |||
| if os.path.isabs(conf_path): | |||
| json_conf_path = conf_path | |||
| else: | |||
| json_conf_path = os.path.join(get_project_base_directory(), conf_path) | |||
| try: | |||
| with open(json_conf_path) as f: | |||
| return json.load(f) | |||
| except BaseException: | |||
| raise EnvironmentError( | |||
| "loading json file config from '{}' failed!".format(json_conf_path) | |||
| ) | |||
| def load_yaml_conf(conf_path): | |||
| if not os.path.isabs(conf_path): | |||
| conf_path = os.path.join(get_project_base_directory(), conf_path) | |||
| try: | |||
| with open(conf_path) as f: | |||
| yaml = YAML(typ='safe', pure=True) | |||
| return yaml.load(f) | |||
| except Exception as e: | |||
| raise EnvironmentError( | |||
| "loading yaml file config from {} failed:".format(conf_path), e | |||
| ) | |||
| def rewrite_yaml_conf(conf_path, config): | |||
| if not os.path.isabs(conf_path): | |||
| conf_path = os.path.join(get_project_base_directory(), conf_path) | |||
| try: | |||
| with open(conf_path, "w") as f: | |||
| yaml = YAML(typ="safe") | |||
| yaml.dump(config, f) | |||
| except Exception as e: | |||
| raise EnvironmentError( | |||
| "rewrite yaml file config {} failed:".format(conf_path), e | |||
| ) | |||
| def rewrite_json_file(filepath, json_data): | |||
| with open(filepath, "w") as f: | |||
| json.dump(json_data, f, indent=4, separators=(",", ": ")) | |||
| f.close() | |||
| def filename_type(filename): | |||
| filename = filename.lower() | |||
| if re.match(r".*\.pdf$", filename): | |||
| return FileType.PDF.value | |||
| if re.match( | |||
| r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename): | |||
| return FileType.DOC.value | |||
| if re.match( | |||
| r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename): | |||
| return FileType.AURAL.value | |||
| if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename): | |||
| return FileType.VISUAL.value | |||
| return FileType.OTHER.value | |||
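A spot check of the classification above, assuming `FileType` from `api.db` enumerates PDF, DOC, AURAL, VISUAL and OTHER as imported at the top of the file; the file names are made up.

```python
for name in ("report.PDF", "notes.md", "talk.mp3", "scan.jpeg", "archive.tar.gz"):
    print(name, "->", filename_type(name))
# report.PDF     -> FileType.PDF.value     (matching is done on the lower-cased name)
# notes.md       -> FileType.DOC.value
# talk.mp3       -> FileType.AURAL.value
# scan.jpeg      -> FileType.VISUAL.value
# archive.tar.gz -> FileType.OTHER.value   (no pattern matches)
```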
| def thumbnail(filename, blob): | |||
| filename = filename.lower() | |||
| if re.match(r".*\.pdf$", filename): | |||
| pdf = pdfplumber.open(BytesIO(blob)) | |||
| buffered = BytesIO() | |||
| pdf.pages[0].to_image(resolution=32).annotated.save(buffered, format="png") | |||
| return "data:image/png;base64," + \ | |||
| base64.b64encode(buffered.getvalue()).decode("utf-8") | |||
| if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename): | |||
| image = Image.open(BytesIO(blob)) | |||
| image.thumbnail((30, 30)) | |||
| buffered = BytesIO() | |||
| image.save(buffered, format="png") | |||
| return "data:image/png;base64," + \ | |||
| base64.b64encode(buffered.getvalue()).decode("utf-8") | |||
| if re.match(r".*\.(ppt|pptx)$", filename): | |||
| import aspose.slides as slides | |||
| import aspose.pydrawing as drawing | |||
| try: | |||
| with slides.Presentation(BytesIO(blob)) as presentation: | |||
| buffered = BytesIO() | |||
| presentation.slides[0].get_thumbnail(0.03, 0.03).save( | |||
| buffered, drawing.imaging.ImageFormat.png) | |||
| return "data:image/png;base64," + \ | |||
| base64.b64encode(buffered.getvalue()).decode("utf-8") | |||
| except Exception as e: | |||
| pass | |||
| def traversal_files(base): | |||
| for root, ds, fs in os.walk(base): | |||
| for f in fs: | |||
| fullname = os.path.join(root, f) | |||
| yield fullname | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import base64 | |||
| import json | |||
| import os | |||
| import re | |||
| from io import BytesIO | |||
| import pdfplumber | |||
| from PIL import Image | |||
| from cachetools import LRUCache, cached | |||
| from ruamel.yaml import YAML | |||
| from api.db import FileType | |||
| PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE") | |||
| RAG_BASE = os.getenv("RAG_BASE") | |||
| def get_project_base_directory(*args): | |||
| global PROJECT_BASE | |||
| if PROJECT_BASE is None: | |||
| PROJECT_BASE = os.path.abspath( | |||
| os.path.join( | |||
| os.path.dirname(os.path.realpath(__file__)), | |||
| os.pardir, | |||
| os.pardir, | |||
| ) | |||
| ) | |||
| if args: | |||
| return os.path.join(PROJECT_BASE, *args) | |||
| return PROJECT_BASE | |||
| def get_rag_directory(*args): | |||
| global RAG_BASE | |||
| if RAG_BASE is None: | |||
| RAG_BASE = os.path.abspath( | |||
| os.path.join( | |||
| os.path.dirname(os.path.realpath(__file__)), | |||
| os.pardir, | |||
| os.pardir, | |||
| os.pardir, | |||
| ) | |||
| ) | |||
| if args: | |||
| return os.path.join(RAG_BASE, *args) | |||
| return RAG_BASE | |||
| def get_rag_python_directory(*args): | |||
| return get_rag_directory("python", *args) | |||
| def get_home_cache_dir(): | |||
| dir = os.path.join(os.path.expanduser('~'), ".ragflow") | |||
| try: | |||
| os.mkdir(dir) | |||
| except OSError as error: | |||
| pass | |||
| return dir | |||
| @cached(cache=LRUCache(maxsize=10)) | |||
| def load_json_conf(conf_path): | |||
| if os.path.isabs(conf_path): | |||
| json_conf_path = conf_path | |||
| else: | |||
| json_conf_path = os.path.join(get_project_base_directory(), conf_path) | |||
| try: | |||
| with open(json_conf_path) as f: | |||
| return json.load(f) | |||
| except BaseException: | |||
| raise EnvironmentError( | |||
| "loading json file config from '{}' failed!".format(json_conf_path) | |||
| ) | |||
| def dump_json_conf(config_data, conf_path): | |||
| if os.path.isabs(conf_path): | |||
| json_conf_path = conf_path | |||
| else: | |||
| json_conf_path = os.path.join(get_project_base_directory(), conf_path) | |||
| try: | |||
| with open(json_conf_path, "w") as f: | |||
| json.dump(config_data, f, indent=4) | |||
| except BaseException: | |||
| raise EnvironmentError( | |||
| "loading json file config from '{}' failed!".format(json_conf_path) | |||
| ) | |||
| def load_json_conf_real_time(conf_path): | |||
| if os.path.isabs(conf_path): | |||
| json_conf_path = conf_path | |||
| else: | |||
| json_conf_path = os.path.join(get_project_base_directory(), conf_path) | |||
| try: | |||
| with open(json_conf_path) as f: | |||
| return json.load(f) | |||
| except BaseException: | |||
| raise EnvironmentError( | |||
| "loading json file config from '{}' failed!".format(json_conf_path) | |||
| ) | |||
| def load_yaml_conf(conf_path): | |||
| if not os.path.isabs(conf_path): | |||
| conf_path = os.path.join(get_project_base_directory(), conf_path) | |||
| try: | |||
| with open(conf_path) as f: | |||
| yaml = YAML(typ='safe', pure=True) | |||
| return yaml.load(f) | |||
| except Exception as e: | |||
| raise EnvironmentError( | |||
| "loading yaml file config from {} failed:".format(conf_path), e | |||
| ) | |||
| def rewrite_yaml_conf(conf_path, config): | |||
| if not os.path.isabs(conf_path): | |||
| conf_path = os.path.join(get_project_base_directory(), conf_path) | |||
| try: | |||
| with open(conf_path, "w") as f: | |||
| yaml = YAML(typ="safe") | |||
| yaml.dump(config, f) | |||
| except Exception as e: | |||
| raise EnvironmentError( | |||
| "rewrite yaml file config {} failed:".format(conf_path), e | |||
| ) | |||
| def rewrite_json_file(filepath, json_data): | |||
| with open(filepath, "w") as f: | |||
| json.dump(json_data, f, indent=4, separators=(",", ": ")) | |||
| f.close() | |||
| def filename_type(filename): | |||
| filename = filename.lower() | |||
| if re.match(r".*\.pdf$", filename): | |||
| return FileType.PDF.value | |||
| if re.match( | |||
| r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename): | |||
| return FileType.DOC.value | |||
| if re.match( | |||
| r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename): | |||
| return FileType.AURAL.value | |||
| if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename): | |||
| return FileType.VISUAL.value | |||
| return FileType.OTHER.value | |||
| def thumbnail(filename, blob): | |||
| filename = filename.lower() | |||
| if re.match(r".*\.pdf$", filename): | |||
| pdf = pdfplumber.open(BytesIO(blob)) | |||
| buffered = BytesIO() | |||
| pdf.pages[0].to_image(resolution=32).annotated.save(buffered, format="png") | |||
| return "data:image/png;base64," + \ | |||
| base64.b64encode(buffered.getvalue()).decode("utf-8") | |||
| if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename): | |||
| image = Image.open(BytesIO(blob)) | |||
| image.thumbnail((30, 30)) | |||
| buffered = BytesIO() | |||
| image.save(buffered, format="png") | |||
| return "data:image/png;base64," + \ | |||
| base64.b64encode(buffered.getvalue()).decode("utf-8") | |||
| if re.match(r".*\.(ppt|pptx)$", filename): | |||
| import aspose.slides as slides | |||
| import aspose.pydrawing as drawing | |||
| try: | |||
| with slides.Presentation(BytesIO(blob)) as presentation: | |||
| buffered = BytesIO() | |||
| presentation.slides[0].get_thumbnail(0.03, 0.03).save( | |||
| buffered, drawing.imaging.ImageFormat.png) | |||
| return "data:image/png;base64," + \ | |||
| base64.b64encode(buffered.getvalue()).decode("utf-8") | |||
| except Exception as e: | |||
| pass | |||
| def traversal_files(base): | |||
| for root, ds, fs in os.walk(base): | |||
| for f in fs: | |||
| fullname = os.path.join(root, f) | |||
| yield fullname | |||
| @@ -1,313 +1,313 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import typing | |||
| import traceback | |||
| import logging | |||
| import inspect | |||
| from logging.handlers import TimedRotatingFileHandler | |||
| from threading import RLock | |||
| from api.utils import file_utils | |||
| class LoggerFactory(object): | |||
| TYPE = "FILE" | |||
| LOG_FORMAT = "[%(levelname)s] [%(asctime)s] [%(module)s.%(funcName)s] [line:%(lineno)d]: %(message)s" | |||
| logging.basicConfig(format=LOG_FORMAT) | |||
| LEVEL = logging.DEBUG | |||
| logger_dict = {} | |||
| global_handler_dict = {} | |||
| LOG_DIR = None | |||
| PARENT_LOG_DIR = None | |||
| log_share = True | |||
| append_to_parent_log = None | |||
| lock = RLock() | |||
| # CRITICAL = 50 | |||
| # FATAL = CRITICAL | |||
| # ERROR = 40 | |||
| # WARNING = 30 | |||
| # WARN = WARNING | |||
| # INFO = 20 | |||
| # DEBUG = 10 | |||
| # NOTSET = 0 | |||
| levels = (10, 20, 30, 40) | |||
| schedule_logger_dict = {} | |||
| @staticmethod | |||
| def set_directory(directory=None, parent_log_dir=None, | |||
| append_to_parent_log=None, force=False): | |||
| if parent_log_dir: | |||
| LoggerFactory.PARENT_LOG_DIR = parent_log_dir | |||
| if append_to_parent_log: | |||
| LoggerFactory.append_to_parent_log = append_to_parent_log | |||
| with LoggerFactory.lock: | |||
| if not directory: | |||
| directory = file_utils.get_project_base_directory("logs") | |||
| if not LoggerFactory.LOG_DIR or force: | |||
| LoggerFactory.LOG_DIR = directory | |||
| if LoggerFactory.log_share: | |||
| oldmask = os.umask(000) | |||
| os.makedirs(LoggerFactory.LOG_DIR, exist_ok=True) | |||
| os.umask(oldmask) | |||
| else: | |||
| os.makedirs(LoggerFactory.LOG_DIR, exist_ok=True) | |||
| for loggerName, ghandler in LoggerFactory.global_handler_dict.items(): | |||
| for className, (logger, | |||
| handler) in LoggerFactory.logger_dict.items(): | |||
| logger.removeHandler(ghandler) | |||
| ghandler.close() | |||
| LoggerFactory.global_handler_dict = {} | |||
| for className, (logger, | |||
| handler) in LoggerFactory.logger_dict.items(): | |||
| logger.removeHandler(handler) | |||
| _handler = None | |||
| if handler: | |||
| handler.close() | |||
| if className != "default": | |||
| _handler = LoggerFactory.get_handler(className) | |||
| logger.addHandler(_handler) | |||
| LoggerFactory.assemble_global_handler(logger) | |||
| LoggerFactory.logger_dict[className] = logger, _handler | |||
| @staticmethod | |||
| def new_logger(name): | |||
| logger = logging.getLogger(name) | |||
| logger.propagate = False | |||
| logger.setLevel(LoggerFactory.LEVEL) | |||
| return logger | |||
| @staticmethod | |||
| def get_logger(class_name=None): | |||
| with LoggerFactory.lock: | |||
| if class_name in LoggerFactory.logger_dict.keys(): | |||
| logger, handler = LoggerFactory.logger_dict[class_name] | |||
| if not logger: | |||
| logger, handler = LoggerFactory.init_logger(class_name) | |||
| else: | |||
| logger, handler = LoggerFactory.init_logger(class_name) | |||
| return logger | |||
| @staticmethod | |||
| def get_global_handler(logger_name, level=None, log_dir=None): | |||
| if not LoggerFactory.LOG_DIR: | |||
| return logging.StreamHandler() | |||
| if log_dir: | |||
| logger_name_key = logger_name + "_" + log_dir | |||
| else: | |||
| logger_name_key = logger_name + "_" + LoggerFactory.LOG_DIR | |||
| # if loggerName not in LoggerFactory.globalHandlerDict: | |||
| if logger_name_key not in LoggerFactory.global_handler_dict: | |||
| with LoggerFactory.lock: | |||
| if logger_name_key not in LoggerFactory.global_handler_dict: | |||
| handler = LoggerFactory.get_handler( | |||
| logger_name, level, log_dir) | |||
| LoggerFactory.global_handler_dict[logger_name_key] = handler | |||
| return LoggerFactory.global_handler_dict[logger_name_key] | |||
| @staticmethod | |||
| def get_handler(class_name, level=None, log_dir=None, | |||
| log_type=None, job_id=None): | |||
| if not log_type: | |||
| if not LoggerFactory.LOG_DIR or not class_name: | |||
| return logging.StreamHandler() | |||
| # return Diy_StreamHandler() | |||
| if not log_dir: | |||
| log_file = os.path.join( | |||
| LoggerFactory.LOG_DIR, | |||
| "{}.log".format(class_name)) | |||
| else: | |||
| log_file = os.path.join(log_dir, "{}.log".format(class_name)) | |||
| else: | |||
| log_file = os.path.join(log_dir, "rag_flow_{}.log".format( | |||
| log_type) if level == LoggerFactory.LEVEL else 'rag_flow_{}_error.log'.format(log_type)) | |||
| os.makedirs(os.path.dirname(log_file), exist_ok=True) | |||
| if LoggerFactory.log_share: | |||
| handler = ROpenHandler(log_file, | |||
| when='D', | |||
| interval=1, | |||
| backupCount=14, | |||
| delay=True) | |||
| else: | |||
| handler = TimedRotatingFileHandler(log_file, | |||
| when='D', | |||
| interval=1, | |||
| backupCount=14, | |||
| delay=True) | |||
| if level: | |||
| handler.level = level | |||
| return handler | |||
| @staticmethod | |||
| def init_logger(class_name): | |||
| with LoggerFactory.lock: | |||
| logger = LoggerFactory.new_logger(class_name) | |||
| handler = None | |||
| if class_name: | |||
| handler = LoggerFactory.get_handler(class_name) | |||
| logger.addHandler(handler) | |||
| LoggerFactory.logger_dict[class_name] = logger, handler | |||
| else: | |||
| LoggerFactory.logger_dict["default"] = logger, handler | |||
| LoggerFactory.assemble_global_handler(logger) | |||
| return logger, handler | |||
| @staticmethod | |||
| def assemble_global_handler(logger): | |||
| if LoggerFactory.LOG_DIR: | |||
| for level in LoggerFactory.levels: | |||
| if level >= LoggerFactory.LEVEL: | |||
| level_logger_name = logging._levelToName[level] | |||
| logger.addHandler( | |||
| LoggerFactory.get_global_handler( | |||
| level_logger_name, level)) | |||
| if LoggerFactory.append_to_parent_log and LoggerFactory.PARENT_LOG_DIR: | |||
| for level in LoggerFactory.levels: | |||
| if level >= LoggerFactory.LEVEL: | |||
| level_logger_name = logging._levelToName[level] | |||
| logger.addHandler( | |||
| LoggerFactory.get_global_handler(level_logger_name, level, LoggerFactory.PARENT_LOG_DIR)) | |||
| def setDirectory(directory=None): | |||
| LoggerFactory.set_directory(directory) | |||
| def setLevel(level): | |||
| LoggerFactory.LEVEL = level | |||
| def getLogger(className=None, useLevelFile=False): | |||
| if className is None: | |||
| frame = inspect.stack()[1] | |||
| module = inspect.getmodule(frame[0]) | |||
| className = 'stat' | |||
| return LoggerFactory.get_logger(className) | |||
| def exception_to_trace_string(ex): | |||
| return "".join(traceback.TracebackException.from_exception(ex).format()) | |||
| class ROpenHandler(TimedRotatingFileHandler): | |||
| def _open(self): | |||
| prevumask = os.umask(000) | |||
| rtv = TimedRotatingFileHandler._open(self) | |||
| os.umask(prevumask) | |||
| return rtv | |||
| def sql_logger(job_id='', log_type='sql'): | |||
| key = job_id + log_type | |||
| if key in LoggerFactory.schedule_logger_dict.keys(): | |||
| return LoggerFactory.schedule_logger_dict[key] | |||
| return get_job_logger(job_id=job_id, log_type=log_type) | |||
| def ready_log(msg, job=None, task=None, role=None, party_id=None, detail=None): | |||
| prefix, suffix = base_msg(job, task, role, party_id, detail) | |||
| return f"{prefix}{msg} ready{suffix}" | |||
| def start_log(msg, job=None, task=None, role=None, party_id=None, detail=None): | |||
| prefix, suffix = base_msg(job, task, role, party_id, detail) | |||
| return f"{prefix}start to {msg}{suffix}" | |||
| def successful_log(msg, job=None, task=None, role=None, | |||
| party_id=None, detail=None): | |||
| prefix, suffix = base_msg(job, task, role, party_id, detail) | |||
| return f"{prefix}{msg} successfully{suffix}" | |||
| def warning_log(msg, job=None, task=None, role=None, | |||
| party_id=None, detail=None): | |||
| prefix, suffix = base_msg(job, task, role, party_id, detail) | |||
| return f"{prefix}{msg} is not effective{suffix}" | |||
| def failed_log(msg, job=None, task=None, role=None, | |||
| party_id=None, detail=None): | |||
| prefix, suffix = base_msg(job, task, role, party_id, detail) | |||
| return f"{prefix}failed to {msg}{suffix}" | |||
| def base_msg(job=None, task=None, role: str = None, | |||
| party_id: typing.Union[str, int] = None, detail=None): | |||
| if detail: | |||
| detail_msg = f" detail: \n{detail}" | |||
| else: | |||
| detail_msg = "" | |||
| if task is not None: | |||
| return f"task {task.f_task_id} {task.f_task_version} ", f" on {task.f_role} {task.f_party_id}{detail_msg}" | |||
| elif job is not None: | |||
| return "", f" on {job.f_role} {job.f_party_id}{detail_msg}" | |||
| elif role and party_id: | |||
| return "", f" on {role} {party_id}{detail_msg}" | |||
| else: | |||
| return "", f"{detail_msg}" | |||
| def exception_to_trace_string(ex): | |||
| return "".join(traceback.TracebackException.from_exception(ex).format()) | |||
| def get_logger_base_dir(): | |||
| job_log_dir = file_utils.get_rag_flow_directory('logs') | |||
| return job_log_dir | |||
| def get_job_logger(job_id, log_type): | |||
| rag_flow_log_dir = file_utils.get_rag_flow_directory('logs', 'rag_flow') | |||
| job_log_dir = file_utils.get_rag_flow_directory('logs', job_id) | |||
| if not job_id: | |||
| log_dirs = [rag_flow_log_dir] | |||
| else: | |||
| if log_type == 'audit': | |||
| log_dirs = [job_log_dir, rag_flow_log_dir] | |||
| else: | |||
| log_dirs = [job_log_dir] | |||
| if LoggerFactory.log_share: | |||
| oldmask = os.umask(000) | |||
| os.makedirs(job_log_dir, exist_ok=True) | |||
| os.makedirs(rag_flow_log_dir, exist_ok=True) | |||
| os.umask(oldmask) | |||
| else: | |||
| os.makedirs(job_log_dir, exist_ok=True) | |||
| os.makedirs(rag_flow_log_dir, exist_ok=True) | |||
| logger = LoggerFactory.new_logger(f"{job_id}_{log_type}") | |||
| for job_log_dir in log_dirs: | |||
| handler = LoggerFactory.get_handler(class_name=None, level=LoggerFactory.LEVEL, | |||
| log_dir=job_log_dir, log_type=log_type, job_id=job_id) | |||
| error_handler = LoggerFactory.get_handler( | |||
| class_name=None, | |||
| level=logging.ERROR, | |||
| log_dir=job_log_dir, | |||
| log_type=log_type, | |||
| job_id=job_id) | |||
| logger.addHandler(handler) | |||
| logger.addHandler(error_handler) | |||
| with LoggerFactory.lock: | |||
| LoggerFactory.schedule_logger_dict[job_id + log_type] = logger | |||
| return logger | |||
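A minimal usage sketch for the logger factory above, meant to be read in the context of this module; the log directory and logger name are assumptions for illustration.

```python
import logging

LoggerFactory.LEVEL = logging.INFO                # same effect as calling setLevel(logging.INFO)
LoggerFactory.set_directory("/tmp/ragflow_logs")  # creates the directory and (re)binds handlers

db_logger = getLogger("database")                 # per-name file handler -> .../database.log,
db_logger.info("connection pool ready")           # plus the global per-level handlers
```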
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import typing | |||
| import traceback | |||
| import logging | |||
| import inspect | |||
| from logging.handlers import TimedRotatingFileHandler | |||
| from threading import RLock | |||
| from api.utils import file_utils | |||
| class LoggerFactory(object): | |||
| TYPE = "FILE" | |||
| LOG_FORMAT = "[%(levelname)s] [%(asctime)s] [%(module)s.%(funcName)s] [line:%(lineno)d]: %(message)s" | |||
| logging.basicConfig(format=LOG_FORMAT) | |||
| LEVEL = logging.DEBUG | |||
| logger_dict = {} | |||
| global_handler_dict = {} | |||
| LOG_DIR = None | |||
| PARENT_LOG_DIR = None | |||
| log_share = True | |||
| append_to_parent_log = None | |||
| lock = RLock() | |||
| # CRITICAL = 50 | |||
| # FATAL = CRITICAL | |||
| # ERROR = 40 | |||
| # WARNING = 30 | |||
| # WARN = WARNING | |||
| # INFO = 20 | |||
| # DEBUG = 10 | |||
| # NOTSET = 0 | |||
| levels = (10, 20, 30, 40) | |||
| schedule_logger_dict = {} | |||
| @staticmethod | |||
| def set_directory(directory=None, parent_log_dir=None, | |||
| append_to_parent_log=None, force=False): | |||
| if parent_log_dir: | |||
| LoggerFactory.PARENT_LOG_DIR = parent_log_dir | |||
| if append_to_parent_log: | |||
| LoggerFactory.append_to_parent_log = append_to_parent_log | |||
| with LoggerFactory.lock: | |||
| if not directory: | |||
| directory = file_utils.get_project_base_directory("logs") | |||
| if not LoggerFactory.LOG_DIR or force: | |||
| LoggerFactory.LOG_DIR = directory | |||
| if LoggerFactory.log_share: | |||
| oldmask = os.umask(000) | |||
| os.makedirs(LoggerFactory.LOG_DIR, exist_ok=True) | |||
| os.umask(oldmask) | |||
| else: | |||
| os.makedirs(LoggerFactory.LOG_DIR, exist_ok=True) | |||
| for loggerName, ghandler in LoggerFactory.global_handler_dict.items(): | |||
| for className, (logger, | |||
| handler) in LoggerFactory.logger_dict.items(): | |||
| logger.removeHandler(ghandler) | |||
| ghandler.close() | |||
| LoggerFactory.global_handler_dict = {} | |||
| for className, (logger, | |||
| handler) in LoggerFactory.logger_dict.items(): | |||
| logger.removeHandler(handler) | |||
| _handler = None | |||
| if handler: | |||
| handler.close() | |||
| if className != "default": | |||
| _handler = LoggerFactory.get_handler(className) | |||
| logger.addHandler(_handler) | |||
| LoggerFactory.assemble_global_handler(logger) | |||
| LoggerFactory.logger_dict[className] = logger, _handler | |||
| @staticmethod | |||
| def new_logger(name): | |||
| logger = logging.getLogger(name) | |||
| logger.propagate = False | |||
| logger.setLevel(LoggerFactory.LEVEL) | |||
| return logger | |||
| @staticmethod | |||
| def get_logger(class_name=None): | |||
| with LoggerFactory.lock: | |||
| if class_name in LoggerFactory.logger_dict.keys(): | |||
| logger, handler = LoggerFactory.logger_dict[class_name] | |||
| if not logger: | |||
| logger, handler = LoggerFactory.init_logger(class_name) | |||
| else: | |||
| logger, handler = LoggerFactory.init_logger(class_name) | |||
| return logger | |||
| @staticmethod | |||
| def get_global_handler(logger_name, level=None, log_dir=None): | |||
| if not LoggerFactory.LOG_DIR: | |||
| return logging.StreamHandler() | |||
| if log_dir: | |||
| logger_name_key = logger_name + "_" + log_dir | |||
| else: | |||
| logger_name_key = logger_name + "_" + LoggerFactory.LOG_DIR | |||
| # if loggerName not in LoggerFactory.globalHandlerDict: | |||
| if logger_name_key not in LoggerFactory.global_handler_dict: | |||
| with LoggerFactory.lock: | |||
| if logger_name_key not in LoggerFactory.global_handler_dict: | |||
| handler = LoggerFactory.get_handler( | |||
| logger_name, level, log_dir) | |||
| LoggerFactory.global_handler_dict[logger_name_key] = handler | |||
| return LoggerFactory.global_handler_dict[logger_name_key] | |||
| @staticmethod | |||
| def get_handler(class_name, level=None, log_dir=None, | |||
| log_type=None, job_id=None): | |||
| if not log_type: | |||
| if not LoggerFactory.LOG_DIR or not class_name: | |||
| return logging.StreamHandler() | |||
| # return Diy_StreamHandler() | |||
| if not log_dir: | |||
| log_file = os.path.join( | |||
| LoggerFactory.LOG_DIR, | |||
| "{}.log".format(class_name)) | |||
| else: | |||
| log_file = os.path.join(log_dir, "{}.log".format(class_name)) | |||
| else: | |||
| log_file = os.path.join(log_dir, "rag_flow_{}.log".format( | |||
| log_type) if level == LoggerFactory.LEVEL else 'rag_flow_{}_error.log'.format(log_type)) | |||
| os.makedirs(os.path.dirname(log_file), exist_ok=True) | |||
| if LoggerFactory.log_share: | |||
| handler = ROpenHandler(log_file, | |||
| when='D', | |||
| interval=1, | |||
| backupCount=14, | |||
| delay=True) | |||
| else: | |||
| handler = TimedRotatingFileHandler(log_file, | |||
| when='D', | |||
| interval=1, | |||
| backupCount=14, | |||
| delay=True) | |||
| if level: | |||
| handler.level = level | |||
| return handler | |||
| @staticmethod | |||
| def init_logger(class_name): | |||
| with LoggerFactory.lock: | |||
| logger = LoggerFactory.new_logger(class_name) | |||
| handler = None | |||
| if class_name: | |||
| handler = LoggerFactory.get_handler(class_name) | |||
| logger.addHandler(handler) | |||
| LoggerFactory.logger_dict[class_name] = logger, handler | |||
| else: | |||
| LoggerFactory.logger_dict["default"] = logger, handler | |||
| LoggerFactory.assemble_global_handler(logger) | |||
| return logger, handler | |||
| @staticmethod | |||
| def assemble_global_handler(logger): | |||
| if LoggerFactory.LOG_DIR: | |||
| for level in LoggerFactory.levels: | |||
| if level >= LoggerFactory.LEVEL: | |||
| level_logger_name = logging._levelToName[level] | |||
| logger.addHandler( | |||
| LoggerFactory.get_global_handler( | |||
| level_logger_name, level)) | |||
| if LoggerFactory.append_to_parent_log and LoggerFactory.PARENT_LOG_DIR: | |||
| for level in LoggerFactory.levels: | |||
| if level >= LoggerFactory.LEVEL: | |||
| level_logger_name = logging._levelToName[level] | |||
| logger.addHandler( | |||
| LoggerFactory.get_global_handler(level_logger_name, level, LoggerFactory.PARENT_LOG_DIR)) | |||
| def setDirectory(directory=None): | |||
| LoggerFactory.set_directory(directory) | |||
| def setLevel(level): | |||
| LoggerFactory.LEVEL = level | |||
| def getLogger(className=None, useLevelFile=False): | |||
| if className is None: | |||
| frame = inspect.stack()[1] | |||
| module = inspect.getmodule(frame[0]) | |||
| className = 'stat' | |||
| return LoggerFactory.get_logger(className) | |||
| def exception_to_trace_string(ex): | |||
| return "".join(traceback.TracebackException.from_exception(ex).format()) | |||
| class ROpenHandler(TimedRotatingFileHandler): | |||
| def _open(self): | |||
| prevumask = os.umask(000) | |||
| rtv = TimedRotatingFileHandler._open(self) | |||
| os.umask(prevumask) | |||
| return rtv | |||
| def sql_logger(job_id='', log_type='sql'): | |||
| key = job_id + log_type | |||
| if key in LoggerFactory.schedule_logger_dict.keys(): | |||
| return LoggerFactory.schedule_logger_dict[key] | |||
| return get_job_logger(job_id=job_id, log_type=log_type) | |||
| def ready_log(msg, job=None, task=None, role=None, party_id=None, detail=None): | |||
| prefix, suffix = base_msg(job, task, role, party_id, detail) | |||
| return f"{prefix}{msg} ready{suffix}" | |||
| def start_log(msg, job=None, task=None, role=None, party_id=None, detail=None): | |||
| prefix, suffix = base_msg(job, task, role, party_id, detail) | |||
| return f"{prefix}start to {msg}{suffix}" | |||
| def successful_log(msg, job=None, task=None, role=None, | |||
| party_id=None, detail=None): | |||
| prefix, suffix = base_msg(job, task, role, party_id, detail) | |||
| return f"{prefix}{msg} successfully{suffix}" | |||
| def warning_log(msg, job=None, task=None, role=None, | |||
| party_id=None, detail=None): | |||
| prefix, suffix = base_msg(job, task, role, party_id, detail) | |||
| return f"{prefix}{msg} is not effective{suffix}" | |||
| def failed_log(msg, job=None, task=None, role=None, | |||
| party_id=None, detail=None): | |||
| prefix, suffix = base_msg(job, task, role, party_id, detail) | |||
| return f"{prefix}failed to {msg}{suffix}" | |||
| def base_msg(job=None, task=None, role: str = None, | |||
| party_id: typing.Union[str, int] = None, detail=None): | |||
| if detail: | |||
| detail_msg = f" detail: \n{detail}" | |||
| else: | |||
| detail_msg = "" | |||
| if task is not None: | |||
| return f"task {task.f_task_id} {task.f_task_version} ", f" on {task.f_role} {task.f_party_id}{detail_msg}" | |||
| elif job is not None: | |||
| return "", f" on {job.f_role} {job.f_party_id}{detail_msg}" | |||
| elif role and party_id: | |||
| return "", f" on {role} {party_id}{detail_msg}" | |||
| else: | |||
| return "", f"{detail_msg}" | |||
| def exception_to_trace_string(ex): | |||
| return "".join(traceback.TracebackException.from_exception(ex).format()) | |||
| def get_logger_base_dir(): | |||
| job_log_dir = file_utils.get_rag_flow_directory('logs') | |||
| return job_log_dir | |||
| def get_job_logger(job_id, log_type): | |||
| rag_flow_log_dir = file_utils.get_rag_flow_directory('logs', 'rag_flow') | |||
| job_log_dir = file_utils.get_rag_flow_directory('logs', job_id) | |||
| if not job_id: | |||
| log_dirs = [rag_flow_log_dir] | |||
| else: | |||
| if log_type == 'audit': | |||
| log_dirs = [job_log_dir, rag_flow_log_dir] | |||
| else: | |||
| log_dirs = [job_log_dir] | |||
| if LoggerFactory.log_share: | |||
| oldmask = os.umask(000) | |||
| os.makedirs(job_log_dir, exist_ok=True) | |||
| os.makedirs(rag_flow_log_dir, exist_ok=True) | |||
| os.umask(oldmask) | |||
| else: | |||
| os.makedirs(job_log_dir, exist_ok=True) | |||
| os.makedirs(rag_flow_log_dir, exist_ok=True) | |||
| logger = LoggerFactory.new_logger(f"{job_id}_{log_type}") | |||
| for job_log_dir in log_dirs: | |||
| handler = LoggerFactory.get_handler(class_name=None, level=LoggerFactory.LEVEL, | |||
| log_dir=job_log_dir, log_type=log_type, job_id=job_id) | |||
| error_handler = LoggerFactory.get_handler( | |||
| class_name=None, | |||
| level=logging.ERROR, | |||
| log_dir=job_log_dir, | |||
| log_type=log_type, | |||
| job_id=job_id) | |||
| logger.addHandler(handler) | |||
| logger.addHandler(error_handler) | |||
| with LoggerFactory.lock: | |||
| LoggerFactory.schedule_logger_dict[job_id + log_type] = logger | |||
| return logger | |||
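| The rest of the codebase drives this file through the module-level helpers above. A minimal usage sketch follows; the module path `api.utils.log_utils` is an assumption, and only the functions visible in this diff are relied on. | |||
| ```python | |||
| # Minimal sketch; the module path api.utils.log_utils is an assumption. | |||
| from api.utils import log_utils | |||
|  | |||
| log_utils.setLevel(10)                       # DEBUG, per the level table above | |||
| log_utils.setDirectory("/tmp/ragflow_logs")  # where the rotated .log files go | |||
|  | |||
| stat_logger = log_utils.getLogger("stat")            # per-class file handler + global level handlers | |||
| job_logger = log_utils.sql_logger(job_id="job_42")   # job-scoped logger cached in schedule_logger_dict | |||
|  | |||
| stat_logger.info(log_utils.start_log("build index", detail="kb_id=3")) | |||
| job_logger.error(log_utils.failed_log("run query")) | |||
| ``` | |||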
| @@ -1,24 +1,24 @@ | |||
| import base64 | |||
| import os | |||
| import sys | |||
| from Cryptodome.PublicKey import RSA | |||
| from Cryptodome.Cipher import PKCS1_v1_5 as Cipher_pkcs1_v1_5 | |||
| from api.utils import decrypt, file_utils | |||
| def crypt(line): | |||
| file_path = os.path.join( | |||
| file_utils.get_project_base_directory(), | |||
| "conf", | |||
| "public.pem") | |||
| rsa_key = RSA.importKey(open(file_path).read(),"Welcome") | |||
| cipher = Cipher_pkcs1_v1_5.new(rsa_key) | |||
| password_base64 = base64.b64encode(line.encode('utf-8')).decode("utf-8") | |||
| encrypted_password = cipher.encrypt(password_base64.encode()) | |||
| return base64.b64encode(encrypted_password).decode('utf-8') | |||
| if __name__ == "__main__": | |||
| pswd = crypt(sys.argv[1]) | |||
| print(pswd) | |||
| print(decrypt(pswd)) | |||
| import base64 | |||
| import os | |||
| import sys | |||
| from Cryptodome.PublicKey import RSA | |||
| from Cryptodome.Cipher import PKCS1_v1_5 as Cipher_pkcs1_v1_5 | |||
| from api.utils import decrypt, file_utils | |||
| def crypt(line): | |||
| file_path = os.path.join( | |||
| file_utils.get_project_base_directory(), | |||
| "conf", | |||
| "public.pem") | |||
| rsa_key = RSA.importKey(open(file_path).read(),"Welcome") | |||
| cipher = Cipher_pkcs1_v1_5.new(rsa_key) | |||
| password_base64 = base64.b64encode(line.encode('utf-8')).decode("utf-8") | |||
| encrypted_password = cipher.encrypt(password_base64.encode()) | |||
| return base64.b64encode(encrypted_password).decode('utf-8') | |||
| if __name__ == "__main__": | |||
| pswd = crypt(sys.argv[1]) | |||
| print(pswd) | |||
| print(decrypt(pswd)) | |||
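| The `decrypt` imported from `api.utils` is not part of this diff, but the inverse of `crypt()` looks roughly like the sketch below. This is a hypothetical counterpart, assuming a matching `conf/private.pem` protected by the same "Welcome" passphrase; it is not the project's implementation. | |||
| ```python | |||
| # Hypothetical inverse of crypt(); the private.pem path and passphrase are assumptions. | |||
| import base64 | |||
| from Cryptodome.PublicKey import RSA | |||
| from Cryptodome.Cipher import PKCS1_v1_5 as Cipher_pkcs1_v1_5 | |||
|  | |||
| def decrypt_sketch(ciphertext_b64, key_path="conf/private.pem"): | |||
|     rsa_key = RSA.importKey(open(key_path).read(), "Welcome") | |||
|     cipher = Cipher_pkcs1_v1_5.new(rsa_key) | |||
|     # PKCS1_v1_5 returns the sentinel on failure instead of raising | |||
|     plaintext_b64 = cipher.decrypt(base64.b64decode(ciphertext_b64), sentinel=b"") | |||
|     return base64.b64decode(plaintext_b64).decode("utf-8") | |||
| ``` | |||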
| @@ -1,28 +1,28 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import dotenv | |||
| import typing | |||
| from api.utils.file_utils import get_project_base_directory | |||
| def get_versions() -> typing.Mapping[str, typing.Any]: | |||
| dotenv.load_dotenv(dotenv.find_dotenv()) | |||
| return dotenv.dotenv_values() | |||
| def get_rag_version() -> typing.Optional[str]: | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import dotenv | |||
| import typing | |||
| from api.utils.file_utils import get_project_base_directory | |||
| def get_versions() -> typing.Mapping[str, typing.Any]: | |||
| dotenv.load_dotenv(dotenv.find_dotenv()) | |||
| return dotenv.dotenv_values() | |||
| def get_rag_version() -> typing.Optional[str]: | |||
| return get_versions().get("RAGFLOW_VERSION", "dev") | |||
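| A quick sketch of how these helpers are used, assuming a `.env` file at the project root containing `RAGFLOW_VERSION=v0.10.0` (the module path `api.versions` and the sample value are assumptions): | |||
| ```python | |||
| # Assumes a .env file containing: RAGFLOW_VERSION=v0.10.0 | |||
| from api.versions import get_rag_version, get_versions  # module path is an assumption | |||
|  | |||
| print(get_rag_version())     # -> "v0.10.0", or "dev" if the key is missing | |||
| print(dict(get_versions()))  # -> every key/value pair found in the .env file | |||
| ``` | |||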
| @@ -1,49 +1,49 @@ | |||
| ragflow: | |||
| host: 0.0.0.0 | |||
| http_port: 9380 | |||
| mysql: | |||
| name: 'rag_flow' | |||
| user: 'root' | |||
| password: 'infini_rag_flow' | |||
| host: 'mysql' | |||
| port: 3306 | |||
| max_connections: 100 | |||
| stale_timeout: 30 | |||
| minio: | |||
| user: 'rag_flow' | |||
| password: 'infini_rag_flow' | |||
| host: 'minio:9000' | |||
| es: | |||
| hosts: 'http://es01:9200' | |||
| username: 'elastic' | |||
| password: 'infini_rag_flow' | |||
| redis: | |||
| db: 1 | |||
| password: 'infini_rag_flow' | |||
| host: 'redis:6379' | |||
| user_default_llm: | |||
| factory: 'Tongyi-Qianwen' | |||
| api_key: 'sk-xxxxxxxxxxxxx' | |||
| base_url: '' | |||
| oauth: | |||
| github: | |||
| client_id: xxxxxxxxxxxxxxxxxxxxxxxxx | |||
| secret_key: xxxxxxxxxxxxxxxxxxxxxxxxxxxx | |||
| url: https://github.com/login/oauth/access_token | |||
| feishu: | |||
| app_id: cli_xxxxxxxxxxxxxxxxxxx | |||
| app_secret: xxxxxxxxxxxxxxxxxxxxxxxxxxxx | |||
| app_access_token_url: https://open.feishu.cn/open-apis/auth/v3/app_access_token/internal | |||
| user_access_token_url: https://open.feishu.cn/open-apis/authen/v1/oidc/access_token | |||
| grant_type: 'authorization_code' | |||
| authentication: | |||
| client: | |||
| switch: false | |||
| http_app_key: | |||
| http_secret_key: | |||
| site: | |||
| switch: false | |||
| permission: | |||
| switch: false | |||
| component: false | |||
| dataset: false | |||
| ragflow: | |||
| host: 0.0.0.0 | |||
| http_port: 9380 | |||
| mysql: | |||
| name: 'rag_flow' | |||
| user: 'root' | |||
| password: 'infini_rag_flow' | |||
| host: 'mysql' | |||
| port: 3306 | |||
| max_connections: 100 | |||
| stale_timeout: 30 | |||
| minio: | |||
| user: 'rag_flow' | |||
| password: 'infini_rag_flow' | |||
| host: 'minio:9000' | |||
| es: | |||
| hosts: 'http://es01:9200' | |||
| username: 'elastic' | |||
| password: 'infini_rag_flow' | |||
| redis: | |||
| db: 1 | |||
| password: 'infini_rag_flow' | |||
| host: 'redis:6379' | |||
| user_default_llm: | |||
| factory: 'Tongyi-Qianwen' | |||
| api_key: 'sk-xxxxxxxxxxxxx' | |||
| base_url: '' | |||
| oauth: | |||
| github: | |||
| client_id: xxxxxxxxxxxxxxxxxxxxxxxxx | |||
| secret_key: xxxxxxxxxxxxxxxxxxxxxxxxxxxx | |||
| url: https://github.com/login/oauth/access_token | |||
| feishu: | |||
| app_id: cli_xxxxxxxxxxxxxxxxxxx | |||
| app_secret: xxxxxxxxxxxxxxxxxxxxxxxxxxxx | |||
| app_access_token_url: https://open.feishu.cn/open-apis/auth/v3/app_access_token/internal | |||
| user_access_token_url: https://open.feishu.cn/open-apis/authen/v1/oidc/access_token | |||
| grant_type: 'authorization_code' | |||
| authentication: | |||
| client: | |||
| switch: false | |||
| http_app_key: | |||
| http_secret_key: | |||
| site: | |||
| switch: false | |||
| permission: | |||
| switch: false | |||
| component: false | |||
| dataset: false | |||
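| These settings are plain YAML, so a quick way to inspect them outside the service is a loader like the sketch below; the path `conf/service_conf.yaml` is an assumption, and the project itself reads the file through its own settings module. | |||
| ```python | |||
| # Illustrative only; the path conf/service_conf.yaml is an assumption. | |||
| import yaml | |||
|  | |||
| with open("conf/service_conf.yaml") as f: | |||
|     conf = yaml.safe_load(f) | |||
|  | |||
| print(conf["ragflow"]["http_port"])  # 9380 | |||
| mysql = conf["mysql"] | |||
| print(f"mysql://{mysql['user']}@{mysql['host']}:{mysql['port']}/{mysql['name']}") | |||
| ``` | |||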
| @@ -1,122 +1,122 @@ | |||
| English | [简体中文](./README_zh.md) | |||
| # *Deep*Doc | |||
| - [1. Introduction](#1) | |||
| - [2. Vision](#2) | |||
| - [3. Parser](#3) | |||
| <a name="1"></a> | |||
| ## 1. Introduction | |||
| With documents from various domains, in various formats, and with diverse retrieval requirements, | |||
| accurate analysis becomes a very challenging task. *Deep*Doc was born for that purpose. | |||
| There are two parts in *Deep*Doc so far: vision and parser. | |||
| You can run the following test programs if you are interested in our results for OCR, layout recognition, and TSR. | |||
| ```bash | |||
| python deepdoc/vision/t_ocr.py -h | |||
| usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] | |||
| options: | |||
| -h, --help show this help message and exit | |||
| --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF | |||
| --output_dir OUTPUT_DIR | |||
| Directory where to store the output images. Default: './ocr_outputs' | |||
| ``` | |||
| ```bash | |||
| python deepdoc/vision/t_recognizer.py -h | |||
| usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}] | |||
| options: | |||
| -h, --help show this help message and exit | |||
| --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF | |||
| --output_dir OUTPUT_DIR | |||
| Directory where to store the output images. Default: './layouts_outputs' | |||
| --threshold THRESHOLD | |||
| A threshold to filter out detections. Default: 0.5 | |||
| --mode {layout,tsr} Task mode: layout recognition or table structure recognition | |||
| ``` | |||
| Our models are hosted on HuggingFace. If you have trouble downloading HuggingFace models, this might help: | |||
| ```bash | |||
| export HF_ENDPOINT=https://hf-mirror.com | |||
| ``` | |||
| <a name="2"></a> | |||
| ## 2. Vision | |||
| We use visual information to resolve problems the way a human would. | |||
| - OCR. Since many documents are presented as images, or can at least be converted to images, | |||
| OCR is an essential, fundamental, and almost universal solution for text extraction. | |||
| ```bash | |||
| python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result | |||
| ``` | |||
| The input can be a directory of images or PDFs, or a single image or PDF. | |||
| You can look into the folder 'path_to_store_result', which contains images that show the positions of the recognized text | |||
| along with txt files that contain the OCR text. | |||
| <div align="center" style="margin-top:20px;margin-bottom:20px;"> | |||
| <img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/> | |||
| </div> | |||
| - Layout recognition. Documents from different domains may have very different layouts; | |||
| newspapers, magazines, books, and résumés, for instance, are all distinct in terms of layout. | |||
| Only with an accurate layout analysis can a machine decide whether text parts are consecutive, | |||
| whether a region needs Table Structure Recognition (TSR), or whether a region is a figure described by a nearby caption. | |||
| We have 10 basic layout components, which cover most cases: | |||
| - Text | |||
| - Title | |||
| - Figure | |||
| - Figure caption | |||
| - Table | |||
| - Table caption | |||
| - Header | |||
| - Footer | |||
| - Reference | |||
| - Equation | |||
| Try the following command to see the layout detection results. | |||
| ```bash | |||
| python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result | |||
| ``` | |||
| The input can be a directory of images or PDFs, or a single image or PDF. | |||
| You can look into the folder 'path_to_store_result', which contains images that illustrate the detection results, as follows: | |||
| <div align="center" style="margin-top:20px;margin-bottom:20px;"> | |||
| <img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/> | |||
| </div> | |||
| - Table Structure Recognition (TSR). A data table is a frequently used structure for presenting data, whether numbers or text. | |||
| The structure of a table can be very complex, with hierarchical headers, spanning cells, and projected row headers. | |||
| Along with TSR, we also reassemble the content into sentences that an LLM can readily comprehend. | |||
| We have five labels for the TSR task: | |||
| - Column | |||
| - Row | |||
| - Column header | |||
| - Projected row header | |||
| - Spanning cell | |||
| Try the following command to see the table structure recognition results. | |||
| ```bash | |||
| python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result | |||
| ``` | |||
| The input can be a directory of images or PDFs, or a single image or PDF. | |||
| You can look into the folder 'path_to_store_result', which contains both images and HTML pages that illustrate the detection results, as follows: | |||
| <div align="center" style="margin-top:20px;margin-bottom:20px;"> | |||
| <img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/> | |||
| </div> | |||
| <a name="3"></a> | |||
| ## 3. Parser | |||
| Four kinds of document formats, PDF, DOCX, EXCEL, and PPT, each have their corresponding parser. | |||
| The most complex one is the PDF parser, because of PDF's flexibility. The output of the PDF parser includes: | |||
| - Text chunks with their positions in the PDF (page number and rectangular coordinates). | |||
| - Tables cropped as images from the PDF, with their contents already translated into natural-language sentences. | |||
| - Figures with their captions and the text inside the figures. | |||
| ### Résumé | |||
| The résumé is a very complicated kind of document. A résumé composed of unstructured text | |||
| with varied layouts can be resolved into structured data made up of nearly a hundred fields. | |||
| We have not open-sourced the parser itself yet; only the processing that follows the parsing procedure is open. | |||
| English | [简体中文](./README_zh.md) | |||
| # *Deep*Doc | |||
| - [1. Introduction](#1) | |||
| - [2. Vision](#2) | |||
| - [3. Parser](#3) | |||
| <a name="1"></a> | |||
| ## 1. Introduction | |||
| With documents from various domains, in various formats, and with diverse retrieval requirements, | |||
| accurate analysis becomes a very challenging task. *Deep*Doc was born for that purpose. | |||
| There are two parts in *Deep*Doc so far: vision and parser. | |||
| You can run the following test programs if you are interested in our results for OCR, layout recognition, and TSR. | |||
| ```bash | |||
| python deepdoc/vision/t_ocr.py -h | |||
| usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] | |||
| options: | |||
| -h, --help show this help message and exit | |||
| --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF | |||
| --output_dir OUTPUT_DIR | |||
| Directory where to store the output images. Default: './ocr_outputs' | |||
| ``` | |||
| ```bash | |||
| python deepdoc/vision/t_recognizer.py -h | |||
| usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}] | |||
| options: | |||
| -h, --help show this help message and exit | |||
| --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF | |||
| --output_dir OUTPUT_DIR | |||
| Directory where to store the output images. Default: './layouts_outputs' | |||
| --threshold THRESHOLD | |||
| A threshold to filter out detections. Default: 0.5 | |||
| --mode {layout,tsr} Task mode: layout recognition or table structure recognition | |||
| ``` | |||
| Our models are hosted on HuggingFace. If you have trouble downloading HuggingFace models, this might help: | |||
| ```bash | |||
| export HF_ENDPOINT=https://hf-mirror.com | |||
| ``` | |||
| <a name="2"></a> | |||
| ## 2. Vision | |||
| We use visual information to resolve problems the way a human would. | |||
| - OCR. Since many documents are presented as images, or can at least be converted to images, | |||
| OCR is an essential, fundamental, and almost universal solution for text extraction. | |||
| ```bash | |||
| python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result | |||
| ``` | |||
| The input can be a directory of images or PDFs, or a single image or PDF. | |||
| You can look into the folder 'path_to_store_result', which contains images that show the positions of the recognized text | |||
| along with txt files that contain the OCR text. | |||
| <div align="center" style="margin-top:20px;margin-bottom:20px;"> | |||
| <img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/> | |||
| </div> | |||
| - Layout recognition. Documents from different domains may have very different layouts; | |||
| newspapers, magazines, books, and résumés, for instance, are all distinct in terms of layout. | |||
| Only with an accurate layout analysis can a machine decide whether text parts are consecutive, | |||
| whether a region needs Table Structure Recognition (TSR), or whether a region is a figure described by a nearby caption. | |||
| We have 10 basic layout components, which cover most cases: | |||
| - Text | |||
| - Title | |||
| - Figure | |||
| - Figure caption | |||
| - Table | |||
| - Table caption | |||
| - Header | |||
| - Footer | |||
| - Reference | |||
| - Equation | |||
| Try the following command to see the layout detection results. | |||
| ```bash | |||
| python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result | |||
| ``` | |||
| The input can be a directory of images or PDFs, or a single image or PDF. | |||
| You can look into the folder 'path_to_store_result', which contains images that illustrate the detection results, as follows: | |||
| <div align="center" style="margin-top:20px;margin-bottom:20px;"> | |||
| <img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/> | |||
| </div> | |||
| - Table Structure Recognition (TSR). A data table is a frequently used structure for presenting data, whether numbers or text. | |||
| The structure of a table can be very complex, with hierarchical headers, spanning cells, and projected row headers. | |||
| Along with TSR, we also reassemble the content into sentences that an LLM can readily comprehend. | |||
| We have five labels for the TSR task: | |||
| - Column | |||
| - Row | |||
| - Column header | |||
| - Projected row header | |||
| - Spanning cell | |||
| Try the following command to see the table structure recognition results. | |||
| ```bash | |||
| python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result | |||
| ``` | |||
| The input can be a directory of images or PDFs, or a single image or PDF. | |||
| You can look into the folder 'path_to_store_result', which contains both images and HTML pages that illustrate the detection results, as follows: | |||
| <div align="center" style="margin-top:20px;margin-bottom:20px;"> | |||
| <img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/> | |||
| </div> | |||
| <a name="3"></a> | |||
| ## 3. Parser | |||
| Four kinds of document formats, PDF, DOCX, EXCEL, and PPT, each have their corresponding parser. | |||
| The most complex one is the PDF parser, because of PDF's flexibility. The output of the PDF parser includes: | |||
| - Text chunks with their positions in the PDF (page number and rectangular coordinates). | |||
| - Tables cropped as images from the PDF, with their contents already translated into natural-language sentences. | |||
| - Figures with their captions and the text inside the figures. | |||
| ### Résumé | |||
| The résumé is a very complicated kind of document. A résumé composed of unstructured text | |||
| with varied layouts can be resolved into structured data made up of nearly a hundred fields. | |||
| We have not open-sourced the parser itself yet; only the processing that follows the parsing procedure is open. | |||
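| The `t_ocr.py` and `t_recognizer.py` test programs above are plain CLIs, so they are easy to batch from Python; the sketch below simply shells out to them with the flags documented earlier (the input path is a placeholder). | |||
| ```python | |||
| # Drives the CLIs documented above; the input path is a placeholder. | |||
| import subprocess | |||
|  | |||
| samples = "path_to_images_or_pdfs" | |||
|  | |||
| subprocess.run(["python", "deepdoc/vision/t_ocr.py", | |||
|                 "--inputs", samples, "--output_dir", "ocr_out"], check=True) | |||
|  | |||
| for mode, out_dir in [("layout", "layout_out"), ("tsr", "tsr_out")]: | |||
|     subprocess.run(["python", "deepdoc/vision/t_recognizer.py", | |||
|                     "--inputs", samples, "--threshold", "0.2", | |||
|                     "--mode", mode, "--output_dir", out_dir], check=True) | |||
| ``` | |||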
| @@ -1,61 +1,61 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from io import BytesIO | |||
| from pptx import Presentation | |||
| class RAGFlowPptParser(object): | |||
| def __init__(self): | |||
| super().__init__() | |||
| def __extract(self, shape): | |||
| if shape.shape_type == 19: | |||
| tb = shape.table | |||
| rows = [] | |||
| for i in range(1, len(tb.rows)): | |||
| rows.append("; ".join([tb.cell( | |||
| 0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)])) | |||
| return "\n".join(rows) | |||
| if shape.has_text_frame: | |||
| return shape.text_frame.text | |||
| if shape.shape_type == 6: | |||
| texts = [] | |||
| for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)): | |||
| t = self.__extract(p) | |||
| if t: | |||
| texts.append(t) | |||
| return "\n".join(texts) | |||
| def __call__(self, fnm, from_page, to_page, callback=None): | |||
| ppt = Presentation(fnm) if isinstance( | |||
| fnm, str) else Presentation( | |||
| BytesIO(fnm)) | |||
| txts = [] | |||
| self.total_page = len(ppt.slides) | |||
| for i, slide in enumerate(ppt.slides): | |||
| if i < from_page: | |||
| continue | |||
| if i >= to_page: | |||
| break | |||
| texts = [] | |||
| for shape in sorted( | |||
| slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)): | |||
| txt = self.__extract(shape) | |||
| if txt: | |||
| texts.append(txt) | |||
| txts.append("\n".join(texts)) | |||
| return txts | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from io import BytesIO | |||
| from pptx import Presentation | |||
| class RAGFlowPptParser(object): | |||
| def __init__(self): | |||
| super().__init__() | |||
| def __extract(self, shape): | |||
| if shape.shape_type == 19: | |||
| tb = shape.table | |||
| rows = [] | |||
| for i in range(1, len(tb.rows)): | |||
| rows.append("; ".join([tb.cell( | |||
| 0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)])) | |||
| return "\n".join(rows) | |||
| if shape.has_text_frame: | |||
| return shape.text_frame.text | |||
| if shape.shape_type == 6: | |||
| texts = [] | |||
| for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)): | |||
| t = self.__extract(p) | |||
| if t: | |||
| texts.append(t) | |||
| return "\n".join(texts) | |||
| def __call__(self, fnm, from_page, to_page, callback=None): | |||
| ppt = Presentation(fnm) if isinstance( | |||
| fnm, str) else Presentation( | |||
| BytesIO(fnm)) | |||
| txts = [] | |||
| self.total_page = len(ppt.slides) | |||
| for i, slide in enumerate(ppt.slides): | |||
| if i < from_page: | |||
| continue | |||
| if i >= to_page: | |||
| break | |||
| texts = [] | |||
| for shape in sorted( | |||
| slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)): | |||
| txt = self.__extract(shape) | |||
| if txt: | |||
| texts.append(txt) | |||
| txts.append("\n".join(texts)) | |||
| return txts | |||
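| Given the `__call__` signature above, driving the parser looks like this minimal sketch; the file name is a placeholder and the import path is an assumption. `from_page`/`to_page` are 0-based slide indices with `to_page` exclusive, and raw bytes are also accepted via the `BytesIO` branch. | |||
| ```python | |||
| # Sketch; deck.pptx is a placeholder and the import path is an assumption. | |||
| from deepdoc.parser.ppt_parser import RAGFlowPptParser | |||
|  | |||
| parser = RAGFlowPptParser() | |||
| slide_texts = parser("deck.pptx", from_page=0, to_page=8)  # one string per slide | |||
| for i, text in enumerate(slide_texts): | |||
|     print(f"--- slide {i} ---\n{text}") | |||
| ``` | |||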
| @@ -1,65 +1,65 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import datetime | |||
| def refactor(cv): | |||
| for n in ["raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"]: | |||
| if n in cv and cv[n] is not None: del cv[n] | |||
| cv["is_deleted"] = 0 | |||
| if "basic" not in cv: cv["basic"] = {} | |||
| if cv["basic"].get("photo2"): del cv["basic"]["photo2"] | |||
| for n in ["education", "work", "certificate", "project", "language", "skill", "training"]: | |||
| if n not in cv or cv[n] is None: continue | |||
| if type(cv[n]) == type({}): cv[n] = [v for _, v in cv[n].items()] | |||
| if type(cv[n]) != type([]): | |||
| del cv[n] | |||
| continue | |||
| vv = [] | |||
| for v in cv[n]: | |||
| if "external" in v and v["external"] is not None: del v["external"] | |||
| vv.append(v) | |||
| cv[n] = {str(i): vv[i] for i in range(len(vv))} | |||
| basics = [ | |||
| ("basic_salary_month", "salary_month"), | |||
| ("expect_annual_salary_from", "expect_annual_salary"), | |||
| ] | |||
| for n, t in basics: | |||
| if cv["basic"].get(n): | |||
| cv["basic"][t] = cv["basic"][n] | |||
| del cv["basic"][n] | |||
| work = sorted([v for _, v in cv.get("work", {}).items()], key=lambda x: x.get("start_time", "")) | |||
| edu = sorted([v for _, v in cv.get("education", {}).items()], key=lambda x: x.get("start_time", "")) | |||
| if work: | |||
| cv["basic"]["work_start_time"] = work[0].get("start_time", "") | |||
| cv["basic"]["management_experience"] = 'Y' if any( | |||
| [w.get("management_experience", '') == 'Y' for w in work]) else 'N' | |||
| cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0") | |||
| for n in ["annual_salary_from", "annual_salary_to", "industry_name", "position_name", "responsibilities", | |||
| "corporation_type", "scale", "corporation_name"]: | |||
| cv["basic"][n] = work[-1].get(n, "") | |||
| if edu: | |||
| for n in ["school_name", "discipline_name"]: | |||
| if n in edu[-1]: cv["basic"][n] = edu[-1][n] | |||
| cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") | |||
| if "contact" not in cv: cv["contact"] = {} | |||
| if not cv["contact"].get("name"): cv["contact"]["name"] = cv["basic"].get("name", "") | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import datetime | |||
| def refactor(cv): | |||
| for n in ["raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"]: | |||
| if n in cv and cv[n] is not None: del cv[n] | |||
| cv["is_deleted"] = 0 | |||
| if "basic" not in cv: cv["basic"] = {} | |||
| if cv["basic"].get("photo2"): del cv["basic"]["photo2"] | |||
| for n in ["education", "work", "certificate", "project", "language", "skill", "training"]: | |||
| if n not in cv or cv[n] is None: continue | |||
| if type(cv[n]) == type({}): cv[n] = [v for _, v in cv[n].items()] | |||
| if type(cv[n]) != type([]): | |||
| del cv[n] | |||
| continue | |||
| vv = [] | |||
| for v in cv[n]: | |||
| if "external" in v and v["external"] is not None: del v["external"] | |||
| vv.append(v) | |||
| cv[n] = {str(i): vv[i] for i in range(len(vv))} | |||
| basics = [ | |||
| ("basic_salary_month", "salary_month"), | |||
| ("expect_annual_salary_from", "expect_annual_salary"), | |||
| ] | |||
| for n, t in basics: | |||
| if cv["basic"].get(n): | |||
| cv["basic"][t] = cv["basic"][n] | |||
| del cv["basic"][n] | |||
| work = sorted([v for _, v in cv.get("work", {}).items()], key=lambda x: x.get("start_time", "")) | |||
| edu = sorted([v for _, v in cv.get("education", {}).items()], key=lambda x: x.get("start_time", "")) | |||
| if work: | |||
| cv["basic"]["work_start_time"] = work[0].get("start_time", "") | |||
| cv["basic"]["management_experience"] = 'Y' if any( | |||
| [w.get("management_experience", '') == 'Y' for w in work]) else 'N' | |||
| cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0") | |||
| for n in ["annual_salary_from", "annual_salary_to", "industry_name", "position_name", "responsibilities", | |||
| "corporation_type", "scale", "corporation_name"]: | |||
| cv["basic"][n] = work[-1].get(n, "") | |||
| if edu: | |||
| for n in ["school_name", "discipline_name"]: | |||
| if n in edu[-1]: cv["basic"][n] = edu[-1][n] | |||
| cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") | |||
| if "contact" not in cv: cv["contact"] = {} | |||
| if not cv["contact"].get("name"): cv["contact"]["name"] = cv["basic"].get("name", "") | |||
| return cv | |||
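| A small, self-contained example of the normalization above, using a hypothetical minimal CV dict (all values are made up; only keys that `refactor` actually reads are included): | |||
| ```python | |||
| # Hypothetical input for the refactor() defined above; every value is made up. | |||
| cv = { | |||
|     "basic": {"name": "Alice", "basic_salary_month": 13}, | |||
|     "work": [{"start_time": "2019-03", "management_experience": "Y", | |||
|               "annual_salary_from": 30, "corporation_name": "ACME"}], | |||
|     "education": [{"start_time": "2012-09", "school_name": "Some University", | |||
|                    "discipline_name": "CS"}], | |||
|     "raw_txt": "...",  # stripped by refactor() | |||
| } | |||
| out = refactor(cv) | |||
| print(out["basic"]["salary_month"])     # 13, renamed from basic_salary_month | |||
| print(out["basic"]["work_start_time"])  # "2019-03" | |||
| print(out["basic"]["school_name"])      # "Some University" | |||
| print(out["work"])                      # {"0": {...}}, the list re-keyed by index | |||
| ``` | |||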
| @@ -1,4 +1,4 @@ | |||
| 清华大学,2,985,清华 | |||
| 清华大学,2,985,清华 | |||
| 清华大学,2,985,Tsinghua University | |||
| 清华大学,2,985,THU | |||
| 北京大学,1,985,北大 | |||
| @@ -1,186 +1,186 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import json | |||
| from deepdoc.parser.resume.entities import degrees, regions, industries | |||
| FIELDS = [ | |||
| "address STRING", | |||
| "annual_salary int", | |||
| "annual_salary_from int", | |||
| "annual_salary_to int", | |||
| "birth STRING", | |||
| "card STRING", | |||
| "certificate_obj string", | |||
| "city STRING", | |||
| "corporation_id int", | |||
| "corporation_name STRING", | |||
| "corporation_type STRING", | |||
| "degree STRING", | |||
| "discipline_name STRING", | |||
| "education_obj string", | |||
| "email STRING", | |||
| "expect_annual_salary int", | |||
| "expect_city_names string", | |||
| "expect_industry_name STRING", | |||
| "expect_position_name STRING", | |||
| "expect_salary_from int", | |||
| "expect_salary_to int", | |||
| "expect_type STRING", | |||
| "gender STRING", | |||
| "industry_name STRING", | |||
| "industry_names STRING", | |||
| "is_deleted STRING", | |||
| "is_fertility STRING", | |||
| "is_house STRING", | |||
| "is_management_experience STRING", | |||
| "is_marital STRING", | |||
| "is_oversea STRING", | |||
| "language_obj string", | |||
| "name STRING", | |||
| "nation STRING", | |||
| "phone STRING", | |||
| "political_status STRING", | |||
| "position_name STRING", | |||
| "project_obj string", | |||
| "responsibilities string", | |||
| "salary_month int", | |||
| "scale STRING", | |||
| "school_name STRING", | |||
| "self_remark string", | |||
| "skill_obj string", | |||
| "title_name STRING", | |||
| "tob_resume_id STRING", | |||
| "updated_at Timestamp", | |||
| "wechat STRING", | |||
| "work_obj string", | |||
| "work_experience int", | |||
| "work_start_time BIGINT" | |||
| ] | |||
| def refactor(df): | |||
| def deal_obj(obj, k, kk): | |||
| if not isinstance(obj, type({})): | |||
| return "" | |||
| obj = obj.get(k, {}) | |||
| if not isinstance(obj, type({})): | |||
| return "" | |||
| return obj.get(kk, "") | |||
| def loadjson(line): | |||
| try: | |||
| return json.loads(line) | |||
| except Exception as e: | |||
| pass | |||
| return {} | |||
| df["obj"] = df["resume_content"].map(lambda x: loadjson(x)) | |||
| df.fillna("", inplace=True) | |||
| clms = ["tob_resume_id", "updated_at"] | |||
| def extract(nms, cc=None): | |||
| nonlocal clms | |||
| clms.extend(nms) | |||
| for c in nms: | |||
| if cc: | |||
| df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c)) | |||
| else: | |||
| df[c] = df["obj"].map( | |||
| lambda x: json.dumps( | |||
| x.get( | |||
| c, | |||
| {}), | |||
| ensure_ascii=False) if isinstance( | |||
| x, | |||
| type( | |||
| {})) and ( | |||
| isinstance( | |||
| x.get(c), | |||
| type( | |||
| {})) or not x.get(c)) else str(x).replace( | |||
| "None", | |||
| "")) | |||
| extract(["education", "work", "certificate", "project", "language", | |||
| "skill"]) | |||
| extract(["wechat", "phone", "is_deleted", | |||
| "name", "tel", "email"], "contact") | |||
| extract(["nation", "expect_industry_name", "salary_month", | |||
| "industry_ids", "is_house", "birth", "annual_salary_from", | |||
| "annual_salary_to", "card", | |||
| "expect_salary_to", "expect_salary_from", | |||
| "expect_position_name", "gender", "city", | |||
| "is_fertility", "expect_city_names", | |||
| "political_status", "title_name", "expect_annual_salary", | |||
| "industry_name", "address", "position_name", "school_name", | |||
| "corporation_id", | |||
| "is_oversea", "responsibilities", | |||
| "work_start_time", "degree", "management_experience", | |||
| "expect_type", "corporation_type", "scale", "corporation_name", | |||
| "self_remark", "annual_salary", "work_experience", | |||
| "discipline_name", "marital", "updated_at"], "basic") | |||
| df["degree"] = df["degree"].map(lambda x: degrees.get_name(x)) | |||
| df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x))) | |||
| df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in | |||
| str(x).split(",")])) | |||
| clms.append("industry_names") | |||
| def arr2str(a): | |||
| if not a: | |||
| return "" | |||
| if isinstance(a, list): | |||
| a = " ".join([str(i) for i in a]) | |||
| return str(a).replace(",", " ") | |||
| df["expect_industry_name"] = df["expect_industry_name"].map( | |||
| lambda x: arr2str(x)) | |||
| df["gender"] = df["gender"].map( | |||
| lambda x: "男" if x == 'M' else ( | |||
| "女" if x == 'F' else "")) | |||
| for c in ["is_fertility", "is_oversea", "is_house", | |||
| "management_experience", "marital"]: | |||
| df[c] = df[c].map( | |||
| lambda x: '是' if x == 'Y' else ( | |||
| '否' if x == 'N' else "")) | |||
| df["is_management_experience"] = df["management_experience"] | |||
| df["is_marital"] = df["marital"] | |||
| clms.extend(["is_management_experience", "is_marital"]) | |||
| df.fillna("", inplace=True) | |||
| for i in range(len(df)): | |||
| if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip(): | |||
| df.loc[i, "phone"] = df.loc[i, "tel"].strip() | |||
| for n in ["industry_ids", "management_experience", "marital", "tel"]: | |||
| for i in range(len(clms)): | |||
| if clms[i] == n: | |||
| del clms[i] | |||
| break | |||
| clms = list(set(clms)) | |||
| df = df.reindex(sorted(clms), axis=1) | |||
| #print(json.dumps(list(df.columns.values)), "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL") | |||
| for c in clms: | |||
| df[c] = df[c].map( | |||
| lambda s: str(s).replace( | |||
| "\t", | |||
| " ").replace( | |||
| "\n", | |||
| "\\n").replace( | |||
| "\r", | |||
| "\\n")) | |||
| # print(df.values.tolist()) | |||
| return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0])) | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import json | |||
| from deepdoc.parser.resume.entities import degrees, regions, industries | |||
| FIELDS = [ | |||
| "address STRING", | |||
| "annual_salary int", | |||
| "annual_salary_from int", | |||
| "annual_salary_to int", | |||
| "birth STRING", | |||
| "card STRING", | |||
| "certificate_obj string", | |||
| "city STRING", | |||
| "corporation_id int", | |||
| "corporation_name STRING", | |||
| "corporation_type STRING", | |||
| "degree STRING", | |||
| "discipline_name STRING", | |||
| "education_obj string", | |||
| "email STRING", | |||
| "expect_annual_salary int", | |||
| "expect_city_names string", | |||
| "expect_industry_name STRING", | |||
| "expect_position_name STRING", | |||
| "expect_salary_from int", | |||
| "expect_salary_to int", | |||
| "expect_type STRING", | |||
| "gender STRING", | |||
| "industry_name STRING", | |||
| "industry_names STRING", | |||
| "is_deleted STRING", | |||
| "is_fertility STRING", | |||
| "is_house STRING", | |||
| "is_management_experience STRING", | |||
| "is_marital STRING", | |||
| "is_oversea STRING", | |||
| "language_obj string", | |||
| "name STRING", | |||
| "nation STRING", | |||
| "phone STRING", | |||
| "political_status STRING", | |||
| "position_name STRING", | |||
| "project_obj string", | |||
| "responsibilities string", | |||
| "salary_month int", | |||
| "scale STRING", | |||
| "school_name STRING", | |||
| "self_remark string", | |||
| "skill_obj string", | |||
| "title_name STRING", | |||
| "tob_resume_id STRING", | |||
| "updated_at Timestamp", | |||
| "wechat STRING", | |||
| "work_obj string", | |||
| "work_experience int", | |||
| "work_start_time BIGINT" | |||
| ] | |||
| def refactor(df): | |||
| def deal_obj(obj, k, kk): | |||
| if not isinstance(obj, type({})): | |||
| return "" | |||
| obj = obj.get(k, {}) | |||
| if not isinstance(obj, type({})): | |||
| return "" | |||
| return obj.get(kk, "") | |||
| def loadjson(line): | |||
| try: | |||
| return json.loads(line) | |||
| except Exception as e: | |||
| pass | |||
| return {} | |||
| df["obj"] = df["resume_content"].map(lambda x: loadjson(x)) | |||
| df.fillna("", inplace=True) | |||
| clms = ["tob_resume_id", "updated_at"] | |||
| def extract(nms, cc=None): | |||
| nonlocal clms | |||
| clms.extend(nms) | |||
| for c in nms: | |||
| if cc: | |||
| df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c)) | |||
| else: | |||
| df[c] = df["obj"].map( | |||
| lambda x: json.dumps( | |||
| x.get( | |||
| c, | |||
| {}), | |||
| ensure_ascii=False) if isinstance( | |||
| x, | |||
| type( | |||
| {})) and ( | |||
| isinstance( | |||
| x.get(c), | |||
| type( | |||
| {})) or not x.get(c)) else str(x).replace( | |||
| "None", | |||
| "")) | |||
| extract(["education", "work", "certificate", "project", "language", | |||
| "skill"]) | |||
| extract(["wechat", "phone", "is_deleted", | |||
| "name", "tel", "email"], "contact") | |||
| extract(["nation", "expect_industry_name", "salary_month", | |||
| "industry_ids", "is_house", "birth", "annual_salary_from", | |||
| "annual_salary_to", "card", | |||
| "expect_salary_to", "expect_salary_from", | |||
| "expect_position_name", "gender", "city", | |||
| "is_fertility", "expect_city_names", | |||
| "political_status", "title_name", "expect_annual_salary", | |||
| "industry_name", "address", "position_name", "school_name", | |||
| "corporation_id", | |||
| "is_oversea", "responsibilities", | |||
| "work_start_time", "degree", "management_experience", | |||
| "expect_type", "corporation_type", "scale", "corporation_name", | |||
| "self_remark", "annual_salary", "work_experience", | |||
| "discipline_name", "marital", "updated_at"], "basic") | |||
| df["degree"] = df["degree"].map(lambda x: degrees.get_name(x)) | |||
| df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x))) | |||
| df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in | |||
| str(x).split(",")])) | |||
| clms.append("industry_names") | |||
| def arr2str(a): | |||
| if not a: | |||
| return "" | |||
| if isinstance(a, list): | |||
| a = " ".join([str(i) for i in a]) | |||
| return str(a).replace(",", " ") | |||
| df["expect_industry_name"] = df["expect_industry_name"].map( | |||
| lambda x: arr2str(x)) | |||
| df["gender"] = df["gender"].map( | |||
| lambda x: "男" if x == 'M' else ( | |||
| "女" if x == 'F' else "")) | |||
| for c in ["is_fertility", "is_oversea", "is_house", | |||
| "management_experience", "marital"]: | |||
| df[c] = df[c].map( | |||
| lambda x: '是' if x == 'Y' else ( | |||
| '否' if x == 'N' else "")) | |||
| df["is_management_experience"] = df["management_experience"] | |||
| df["is_marital"] = df["marital"] | |||
| clms.extend(["is_management_experience", "is_marital"]) | |||
| df.fillna("", inplace=True) | |||
| for i in range(len(df)): | |||
| if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip(): | |||
| df.loc[i, "phone"] = df.loc[i, "tel"].strip() | |||
| for n in ["industry_ids", "management_experience", "marital", "tel"]: | |||
| for i in range(len(clms)): | |||
| if clms[i] == n: | |||
| del clms[i] | |||
| break | |||
| clms = list(set(clms)) | |||
| df = df.reindex(sorted(clms), axis=1) | |||
| #print(json.dumps(list(df.columns.values)), "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL") | |||
| for c in clms: | |||
| df[c] = df[c].map( | |||
| lambda s: str(s).replace( | |||
| "\t", | |||
| " ").replace( | |||
| "\n", | |||
| "\\n").replace( | |||
| "\r", | |||
| "\\n")) | |||
| # print(df.values.tolist()) | |||
| return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0])) | |||
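| The function above expects a one-row-per-résumé DataFrame whose `resume_content` column holds the parsed JSON. A rough sketch of that input shape follows; all values are made up, and the call itself only works inside the repo because it relies on the `degrees`/`regions`/`industries` entity helpers imported above. | |||
| ```python | |||
| # Rough sketch of the expected input shape; values are made up. | |||
| import json | |||
| import pandas as pd | |||
|  | |||
| resume = { | |||
|     "basic": {"name": "Alice", "gender": "F", "management_experience": "Y", "marital": "N"}, | |||
|     "contact": {"phone": "13800000000", "email": "alice@example.com"}, | |||
|     "education": {}, "work": {}, "certificate": {}, | |||
|     "project": {}, "language": {}, "skill": {}, | |||
| } | |||
| df = pd.DataFrame([{ | |||
|     "tob_resume_id": "r-001", | |||
|     "updated_at": "2024-06-01 00:00:00", | |||
|     "resume_content": json.dumps(resume, ensure_ascii=False), | |||
| }]) | |||
| # Inside the repo: record = refactor(df) -> a flat dict keyed by the field names in FIELDS. | |||
| ``` | |||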
| @@ -1,61 +1,61 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import pdfplumber | |||
| from .ocr import OCR | |||
| from .recognizer import Recognizer | |||
| from .layout_recognizer import LayoutRecognizer | |||
| from .table_structure_recognizer import TableStructureRecognizer | |||
| def init_in_out(args): | |||
| from PIL import Image | |||
| import os | |||
| import traceback | |||
| from api.utils.file_utils import traversal_files | |||
| images = [] | |||
| outputs = [] | |||
| if not os.path.exists(args.output_dir): | |||
| os.mkdir(args.output_dir) | |||
| def pdf_pages(fnm, zoomin=3): | |||
| nonlocal outputs, images | |||
| pdf = pdfplumber.open(fnm) | |||
| images = [p.to_image(resolution=72 * zoomin).annotated for i, p in | |||
| enumerate(pdf.pages)] | |||
| for i, page in enumerate(images): | |||
| outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg") | |||
| def images_and_outputs(fnm): | |||
| nonlocal outputs, images | |||
| if fnm.split(".")[-1].lower() == "pdf": | |||
| pdf_pages(fnm) | |||
| return | |||
| try: | |||
| images.append(Image.open(fnm)) | |||
| outputs.append(os.path.split(fnm)[-1]) | |||
| except Exception as e: | |||
| traceback.print_exc() | |||
| if os.path.isdir(args.inputs): | |||
| for fnm in traversal_files(args.inputs): | |||
| images_and_outputs(fnm) | |||
| else: | |||
| images_and_outputs(args.inputs) | |||
| for i in range(len(outputs)): outputs[i] = os.path.join(args.output_dir, outputs[i]) | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import pdfplumber | |||
| from .ocr import OCR | |||
| from .recognizer import Recognizer | |||
| from .layout_recognizer import LayoutRecognizer | |||
| from .table_structure_recognizer import TableStructureRecognizer | |||
| def init_in_out(args): | |||
| from PIL import Image | |||
| import os | |||
| import traceback | |||
| from api.utils.file_utils import traversal_files | |||
| images = [] | |||
| outputs = [] | |||
| if not os.path.exists(args.output_dir): | |||
| os.mkdir(args.output_dir) | |||
| def pdf_pages(fnm, zoomin=3): | |||
| nonlocal outputs, images | |||
| pdf = pdfplumber.open(fnm) | |||
| images = [p.to_image(resolution=72 * zoomin).annotated for i, p in | |||
| enumerate(pdf.pages)] | |||
| for i, page in enumerate(images): | |||
| outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg") | |||
| def images_and_outputs(fnm): | |||
| nonlocal outputs, images | |||
| if fnm.split(".")[-1].lower() == "pdf": | |||
| pdf_pages(fnm) | |||
| return | |||
| try: | |||
| images.append(Image.open(fnm)) | |||
| outputs.append(os.path.split(fnm)[-1]) | |||
| except Exception as e: | |||
| traceback.print_exc() | |||
| if os.path.isdir(args.inputs): | |||
| for fnm in traversal_files(args.inputs): | |||
| images_and_outputs(fnm) | |||
| else: | |||
| images_and_outputs(args.inputs) | |||
| for i in range(len(outputs)): outputs[i] = os.path.join(args.output_dir, outputs[i]) | |||
| return images, outputs | |||
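| `init_in_out` only needs an object with `inputs` and `output_dir` attributes, so a plain `argparse.Namespace` is enough. A minimal sketch, assuming the file above is `deepdoc/vision/__init__.py` and the project root is on `PYTHONPATH` (the sample path is a placeholder): | |||
| ```python | |||
| # Sketch; sample.pdf is a placeholder path. | |||
| from argparse import Namespace | |||
| from deepdoc.vision import init_in_out | |||
|  | |||
| args = Namespace(inputs="sample.pdf", output_dir="./vision_outputs") | |||
| images, outputs = init_in_out(args)  # PIL page images + matching output file names | |||
| for img, out in zip(images, outputs): | |||
|     print(out, img.size) | |||
| ``` | |||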
| @@ -1,151 +1,151 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import re | |||
| from collections import Counter | |||
| from copy import deepcopy | |||
| import numpy as np | |||
| from huggingface_hub import snapshot_download | |||
| from api.utils.file_utils import get_project_base_directory | |||
| from deepdoc.vision import Recognizer | |||
| class LayoutRecognizer(Recognizer): | |||
| labels = [ | |||
| "_background_", | |||
| "Text", | |||
| "Title", | |||
| "Figure", | |||
| "Figure caption", | |||
| "Table", | |||
| "Table caption", | |||
| "Header", | |||
| "Footer", | |||
| "Reference", | |||
| "Equation", | |||
| ] | |||
| def __init__(self, domain): | |||
| try: | |||
| model_dir = os.path.join( | |||
| get_project_base_directory(), | |||
| "rag/res/deepdoc") | |||
| super().__init__(self.labels, domain, model_dir) | |||
| except Exception as e: | |||
| model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc", | |||
| local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"), | |||
| local_dir_use_symlinks=False) | |||
| super().__init__(self.labels, domain, model_dir) | |||
| self.garbage_layouts = ["footer", "header", "reference"] | |||
| def __call__(self, image_list, ocr_res, scale_factor=3, | |||
| thr=0.2, batch_size=16, drop=True): | |||
| def __is_garbage(b): | |||
| patt = [r"^•+$", r"(版权归©|免责条款|地址[::])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$", | |||
| r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}", | |||
| "(资料|数据)来源[::]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}", | |||
| "\\(cid *: *[0-9]+ *\\)" | |||
| ] | |||
| return any([re.search(p, b["text"]) for p in patt]) | |||
| layouts = super().__call__(image_list, thr, batch_size) | |||
| # save_results(image_list, layouts, self.labels, output_dir='output/', threshold=0.7) | |||
| assert len(image_list) == len(ocr_res) | |||
| # Tag layout type | |||
| boxes = [] | |||
| assert len(image_list) == len(layouts) | |||
| garbages = {} | |||
| page_layout = [] | |||
| for pn, lts in enumerate(layouts): | |||
| bxs = ocr_res[pn] | |||
| lts = [{"type": b["type"], | |||
| "score": float(b["score"]), | |||
| "x0": b["bbox"][0] / scale_factor, "x1": b["bbox"][2] / scale_factor, | |||
| "top": b["bbox"][1] / scale_factor, "bottom": b["bbox"][-1] / scale_factor, | |||
| "page_number": pn, | |||
| } for b in lts if float(b["score"]) >= 0.8 or b["type"] not in self.garbage_layouts] | |||
| lts = self.sort_Y_firstly(lts, np.mean( | |||
| [l["bottom"] - l["top"] for l in lts]) / 2) | |||
| lts = self.layouts_cleanup(bxs, lts) | |||
| page_layout.append(lts) | |||
| # Tag layout type, layouts are ready | |||
| def findLayout(ty): | |||
| nonlocal bxs, lts, self | |||
| lts_ = [lt for lt in lts if lt["type"] == ty] | |||
| i = 0 | |||
| while i < len(bxs): | |||
| if bxs[i].get("layout_type"): | |||
| i += 1 | |||
| continue | |||
| if __is_garbage(bxs[i]): | |||
| bxs.pop(i) | |||
| continue | |||
| ii = self.find_overlapped_with_threashold(bxs[i], lts_, | |||
| thr=0.4) | |||
| if ii is None: # belong to nothing | |||
| bxs[i]["layout_type"] = "" | |||
| i += 1 | |||
| continue | |||
| lts_[ii]["visited"] = True | |||
| keep_feats = [ | |||
| lts_[ | |||
| ii]["type"] == "footer" and bxs[i]["bottom"] < image_list[pn].size[1] * 0.9 / scale_factor, | |||
| lts_[ | |||
| ii]["type"] == "header" and bxs[i]["top"] > image_list[pn].size[1] * 0.1 / scale_factor, | |||
| ] | |||
| if drop and lts_[ | |||
| ii]["type"] in self.garbage_layouts and not any(keep_feats): | |||
| if lts_[ii]["type"] not in garbages: | |||
| garbages[lts_[ii]["type"]] = [] | |||
| garbages[lts_[ii]["type"]].append(bxs[i]["text"]) | |||
| bxs.pop(i) | |||
| continue | |||
| bxs[i]["layoutno"] = f"{ty}-{ii}" | |||
| bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ | |||
| ii]["type"] != "equation" else "figure" | |||
| i += 1 | |||
| for lt in ["footer", "header", "reference", "figure caption", | |||
| "table caption", "title", "table", "text", "figure", "equation"]: | |||
| findLayout(lt) | |||
| # add box to figure layouts which has not text box | |||
| for i, lt in enumerate( | |||
| [lt for lt in lts if lt["type"] in ["figure", "equation"]]): | |||
| if lt.get("visited"): | |||
| continue | |||
| lt = deepcopy(lt) | |||
| del lt["type"] | |||
| lt["text"] = "" | |||
| lt["layout_type"] = "figure" | |||
| lt["layoutno"] = f"figure-{i}" | |||
| bxs.append(lt) | |||
| boxes.extend(bxs) | |||
| ocr_res = boxes | |||
| garbage_set = set() | |||
| for k in garbages.keys(): | |||
| garbages[k] = Counter(garbages[k]) | |||
| for g, c in garbages[k].items(): | |||
| if c > 1: | |||
| garbage_set.add(g) | |||
| ocr_res = [b for b in ocr_res if b["text"].strip() not in garbage_set] | |||
| return ocr_res, page_layout | |||
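A minimal usage sketch of `LayoutRecognizer` above, for context only; the page image, the OCR boxes and the `"layout"` domain string are illustrative assumptions, not part of this change.

```python
# Illustrative usage sketch (not part of the diff).
# Assumes PIL page images and OCR boxes carrying text/top/bottom/x0/x1 keys.
from PIL import Image
from deepdoc.vision import LayoutRecognizer   # import path assumed for this repo

pages = [Image.open("page_1.png")]            # hypothetical page render
ocr_res = [[{"text": "Hello world", "top": 10.0, "bottom": 22.0, "x0": 15.0, "x1": 120.0}]]

layouter = LayoutRecognizer("layout")         # falls back to snapshot_download on first run
boxes, page_layouts = layouter(pages, ocr_res, scale_factor=1, drop=True)
# boxes: OCR boxes tagged with layout_type/layoutno; headers, footers and repeated
# garbage lines have been dropped. page_layouts: the cleaned layout regions per page.
```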
| @@ -1,366 +1,366 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import copy | |||
| import re | |||
| import numpy as np | |||
| import cv2 | |||
| from shapely.geometry import Polygon | |||
| import pyclipper | |||
| def build_post_process(config, global_config=None): | |||
| support_dict = ['DBPostProcess', 'CTCLabelDecode'] | |||
| config = copy.deepcopy(config) | |||
| module_name = config.pop('name') | |||
| if module_name == "None": | |||
| return | |||
| if global_config is not None: | |||
| config.update(global_config) | |||
| assert module_name in support_dict, \ | |||
| 'post process only supports {}'.format(support_dict) | |||
| module_class = eval(module_name)(**config) | |||
| return module_class | |||
| class DBPostProcess(object): | |||
| """ | |||
| The post process for Differentiable Binarization (DB). | |||
| """ | |||
| def __init__(self, | |||
| thresh=0.3, | |||
| box_thresh=0.7, | |||
| max_candidates=1000, | |||
| unclip_ratio=2.0, | |||
| use_dilation=False, | |||
| score_mode="fast", | |||
| box_type='quad', | |||
| **kwargs): | |||
| self.thresh = thresh | |||
| self.box_thresh = box_thresh | |||
| self.max_candidates = max_candidates | |||
| self.unclip_ratio = unclip_ratio | |||
| self.min_size = 3 | |||
| self.score_mode = score_mode | |||
| self.box_type = box_type | |||
| assert score_mode in [ | |||
| "slow", "fast" | |||
| ], "Score mode must be in [slow, fast] but got: {}".format(score_mode) | |||
| self.dilation_kernel = None if not use_dilation else np.array( | |||
| [[1, 1], [1, 1]]) | |||
| def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height): | |||
| ''' | |||
| _bitmap: single map with shape (H, W), | |||
| whose values are binarized as {0, 1} | |||
| ''' | |||
| bitmap = _bitmap | |||
| height, width = bitmap.shape | |||
| boxes = [] | |||
| scores = [] | |||
| contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), | |||
| cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) | |||
| for contour in contours[:self.max_candidates]: | |||
| epsilon = 0.002 * cv2.arcLength(contour, True) | |||
| approx = cv2.approxPolyDP(contour, epsilon, True) | |||
| points = approx.reshape((-1, 2)) | |||
| if points.shape[0] < 4: | |||
| continue | |||
| score = self.box_score_fast(pred, points.reshape(-1, 2)) | |||
| if self.box_thresh > score: | |||
| continue | |||
| if points.shape[0] > 2: | |||
| box = self.unclip(points, self.unclip_ratio) | |||
| if len(box) > 1: | |||
| continue | |||
| else: | |||
| continue | |||
| box = box.reshape(-1, 2) | |||
| _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2))) | |||
| if sside < self.min_size + 2: | |||
| continue | |||
| box = np.array(box) | |||
| box[:, 0] = np.clip( | |||
| np.round(box[:, 0] / width * dest_width), 0, dest_width) | |||
| box[:, 1] = np.clip( | |||
| np.round(box[:, 1] / height * dest_height), 0, dest_height) | |||
| boxes.append(box.tolist()) | |||
| scores.append(score) | |||
| return boxes, scores | |||
| def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height): | |||
| ''' | |||
| _bitmap: single map with shape (H, W), | |||
| whose values are binarized as {0, 1} | |||
| ''' | |||
| bitmap = _bitmap | |||
| height, width = bitmap.shape | |||
| outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, | |||
| cv2.CHAIN_APPROX_SIMPLE) | |||
| if len(outs) == 3: | |||
| img, contours, _ = outs[0], outs[1], outs[2] | |||
| elif len(outs) == 2: | |||
| contours, _ = outs[0], outs[1] | |||
| num_contours = min(len(contours), self.max_candidates) | |||
| boxes = [] | |||
| scores = [] | |||
| for index in range(num_contours): | |||
| contour = contours[index] | |||
| points, sside = self.get_mini_boxes(contour) | |||
| if sside < self.min_size: | |||
| continue | |||
| points = np.array(points) | |||
| if self.score_mode == "fast": | |||
| score = self.box_score_fast(pred, points.reshape(-1, 2)) | |||
| else: | |||
| score = self.box_score_slow(pred, contour) | |||
| if self.box_thresh > score: | |||
| continue | |||
| box = self.unclip(points, self.unclip_ratio).reshape(-1, 1, 2) | |||
| box, sside = self.get_mini_boxes(box) | |||
| if sside < self.min_size + 2: | |||
| continue | |||
| box = np.array(box) | |||
| box[:, 0] = np.clip( | |||
| np.round(box[:, 0] / width * dest_width), 0, dest_width) | |||
| box[:, 1] = np.clip( | |||
| np.round(box[:, 1] / height * dest_height), 0, dest_height) | |||
| boxes.append(box.astype("int32")) | |||
| scores.append(score) | |||
| return np.array(boxes, dtype="int32"), scores | |||
| def unclip(self, box, unclip_ratio): | |||
| poly = Polygon(box) | |||
| distance = poly.area * unclip_ratio / poly.length | |||
| offset = pyclipper.PyclipperOffset() | |||
| offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) | |||
| expanded = np.array(offset.Execute(distance)) | |||
| return expanded | |||
| def get_mini_boxes(self, contour): | |||
| bounding_box = cv2.minAreaRect(contour) | |||
| points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) | |||
| index_1, index_2, index_3, index_4 = 0, 1, 2, 3 | |||
| if points[1][1] > points[0][1]: | |||
| index_1 = 0 | |||
| index_4 = 1 | |||
| else: | |||
| index_1 = 1 | |||
| index_4 = 0 | |||
| if points[3][1] > points[2][1]: | |||
| index_2 = 2 | |||
| index_3 = 3 | |||
| else: | |||
| index_2 = 3 | |||
| index_3 = 2 | |||
| box = [ | |||
| points[index_1], points[index_2], points[index_3], points[index_4] | |||
| ] | |||
| return box, min(bounding_box[1]) | |||
| def box_score_fast(self, bitmap, _box): | |||
| ''' | |||
| box_score_fast: use bbox mean score as the mean score | |||
| ''' | |||
| h, w = bitmap.shape[:2] | |||
| box = _box.copy() | |||
| xmin = np.clip(np.floor(box[:, 0].min()).astype("int32"), 0, w - 1) | |||
| xmax = np.clip(np.ceil(box[:, 0].max()).astype("int32"), 0, w - 1) | |||
| ymin = np.clip(np.floor(box[:, 1].min()).astype("int32"), 0, h - 1) | |||
| ymax = np.clip(np.ceil(box[:, 1].max()).astype("int32"), 0, h - 1) | |||
| mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) | |||
| box[:, 0] = box[:, 0] - xmin | |||
| box[:, 1] = box[:, 1] - ymin | |||
| cv2.fillPoly(mask, box.reshape(1, -1, 2).astype("int32"), 1) | |||
| return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] | |||
| def box_score_slow(self, bitmap, contour): | |||
| ''' | |||
| box_score_slow: use polygon mean score as the mean score | |||
| ''' | |||
| h, w = bitmap.shape[:2] | |||
| contour = contour.copy() | |||
| contour = np.reshape(contour, (-1, 2)) | |||
| xmin = np.clip(np.min(contour[:, 0]), 0, w - 1) | |||
| xmax = np.clip(np.max(contour[:, 0]), 0, w - 1) | |||
| ymin = np.clip(np.min(contour[:, 1]), 0, h - 1) | |||
| ymax = np.clip(np.max(contour[:, 1]), 0, h - 1) | |||
| mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) | |||
| contour[:, 0] = contour[:, 0] - xmin | |||
| contour[:, 1] = contour[:, 1] - ymin | |||
| cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1) | |||
| return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] | |||
| def __call__(self, outs_dict, shape_list): | |||
| pred = outs_dict['maps'] | |||
| if not isinstance(pred, np.ndarray): | |||
| pred = pred.numpy() | |||
| pred = pred[:, 0, :, :] | |||
| segmentation = pred > self.thresh | |||
| boxes_batch = [] | |||
| for batch_index in range(pred.shape[0]): | |||
| src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] | |||
| if self.dilation_kernel is not None: | |||
| mask = cv2.dilate( | |||
| np.array(segmentation[batch_index]).astype(np.uint8), | |||
| self.dilation_kernel) | |||
| else: | |||
| mask = segmentation[batch_index] | |||
| if self.box_type == 'poly': | |||
| boxes, scores = self.polygons_from_bitmap(pred[batch_index], | |||
| mask, src_w, src_h) | |||
| elif self.box_type == 'quad': | |||
| boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask, | |||
| src_w, src_h) | |||
| else: | |||
| raise ValueError( | |||
| "box_type can only be one of ['quad', 'poly']") | |||
| boxes_batch.append({'points': boxes}) | |||
| return boxes_batch | |||
| class BaseRecLabelDecode(object): | |||
| """ Convert between text-label and text-index """ | |||
| def __init__(self, character_dict_path=None, use_space_char=False): | |||
| self.beg_str = "sos" | |||
| self.end_str = "eos" | |||
| self.reverse = False | |||
| self.character_str = [] | |||
| if character_dict_path is None: | |||
| self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" | |||
| dict_character = list(self.character_str) | |||
| else: | |||
| with open(character_dict_path, "rb") as fin: | |||
| lines = fin.readlines() | |||
| for line in lines: | |||
| line = line.decode('utf-8').strip("\n").strip("\r\n") | |||
| self.character_str.append(line) | |||
| if use_space_char: | |||
| self.character_str.append(" ") | |||
| dict_character = list(self.character_str) | |||
| if 'arabic' in character_dict_path: | |||
| self.reverse = True | |||
| dict_character = self.add_special_char(dict_character) | |||
| self.dict = {} | |||
| for i, char in enumerate(dict_character): | |||
| self.dict[char] = i | |||
| self.character = dict_character | |||
| def pred_reverse(self, pred): | |||
| pred_re = [] | |||
| c_current = '' | |||
| for c in pred: | |||
| if not bool(re.search('[a-zA-Z0-9 :*./%+-]', c)): | |||
| if c_current != '': | |||
| pred_re.append(c_current) | |||
| pred_re.append(c) | |||
| c_current = '' | |||
| else: | |||
| c_current += c | |||
| if c_current != '': | |||
| pred_re.append(c_current) | |||
| return ''.join(pred_re[::-1]) | |||
| def add_special_char(self, dict_character): | |||
| return dict_character | |||
| def decode(self, text_index, text_prob=None, is_remove_duplicate=False): | |||
| """ convert text-index into text-label. """ | |||
| result_list = [] | |||
| ignored_tokens = self.get_ignored_tokens() | |||
| batch_size = len(text_index) | |||
| for batch_idx in range(batch_size): | |||
| selection = np.ones(len(text_index[batch_idx]), dtype=bool) | |||
| if is_remove_duplicate: | |||
| selection[1:] = text_index[batch_idx][1:] != text_index[ | |||
| batch_idx][:-1] | |||
| for ignored_token in ignored_tokens: | |||
| selection &= text_index[batch_idx] != ignored_token | |||
| char_list = [ | |||
| self.character[text_id] | |||
| for text_id in text_index[batch_idx][selection] | |||
| ] | |||
| if text_prob is not None: | |||
| conf_list = text_prob[batch_idx][selection] | |||
| else: | |||
| conf_list = [1] * len(selection) | |||
| if len(conf_list) == 0: | |||
| conf_list = [0] | |||
| text = ''.join(char_list) | |||
| if self.reverse: # for arabic rec | |||
| text = self.pred_reverse(text) | |||
| result_list.append((text, np.mean(conf_list).tolist())) | |||
| return result_list | |||
| def get_ignored_tokens(self): | |||
| return [0] # for ctc blank | |||
| class CTCLabelDecode(BaseRecLabelDecode): | |||
| """ Convert between text-label and text-index """ | |||
| def __init__(self, character_dict_path=None, use_space_char=False, | |||
| **kwargs): | |||
| super(CTCLabelDecode, self).__init__(character_dict_path, | |||
| use_space_char) | |||
| def __call__(self, preds, label=None, *args, **kwargs): | |||
| if isinstance(preds, tuple) or isinstance(preds, list): | |||
| preds = preds[-1] | |||
| if not isinstance(preds, np.ndarray): | |||
| preds = preds.numpy() | |||
| preds_idx = preds.argmax(axis=2) | |||
| preds_prob = preds.max(axis=2) | |||
| text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) | |||
| if label is None: | |||
| return text | |||
| label = self.decode(label) | |||
| return text, label | |||
| def add_special_char(self, dict_character): | |||
| dict_character = ['blank'] + dict_character | |||
| return dict_character | |||
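For context, a minimal sketch of how `build_post_process` and `DBPostProcess` above are typically exercised; the module path, the config values and the synthetic probability map are illustrative assumptions only.

```python
# Illustrative usage sketch (not part of the diff); module import path assumed.
import numpy as np
from deepdoc.vision.postprocess import build_post_process

db = build_post_process({
    "name": "DBPostProcess",   # must be one of the names in support_dict
    "thresh": 0.3,
    "box_thresh": 0.6,
    "unclip_ratio": 1.5,
})

# Fake detector output: (batch, 1, H, W) probability map with values in [0, 1).
outs = {"maps": np.random.rand(1, 1, 320, 320).astype("float32")}
# Each shape_list row is (src_h, src_w, ratio_h, ratio_w) for the original image.
shape_list = np.array([[640.0, 640.0, 0.5, 0.5]])
for batch in db(outs, shape_list):
    print(len(batch["points"]), "candidate text boxes")
```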
| @@ -1,452 +1,452 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| from copy import deepcopy | |||
| import onnxruntime as ort | |||
| from huggingface_hub import snapshot_download | |||
| from api.utils.file_utils import get_project_base_directory | |||
| from .operators import * | |||
| class Recognizer(object): | |||
| def __init__(self, label_list, task_name, model_dir=None): | |||
| """ | |||
| If you have trouble downloading HuggingFace models, -_^ this might help!! | |||
| For Linux: | |||
| export HF_ENDPOINT=https://hf-mirror.com | |||
| For Windows: | |||
| Good luck | |||
| ^_- | |||
| """ | |||
| if not model_dir: | |||
| model_dir = os.path.join( | |||
| get_project_base_directory(), | |||
| "rag/res/deepdoc") | |||
| model_file_path = os.path.join(model_dir, task_name + ".onnx") | |||
| if not os.path.exists(model_file_path): | |||
| model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc", | |||
| local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"), | |||
| local_dir_use_symlinks=False) | |||
| model_file_path = os.path.join(model_dir, task_name + ".onnx") | |||
| else: | |||
| model_file_path = os.path.join(model_dir, task_name + ".onnx") | |||
| if not os.path.exists(model_file_path): | |||
| raise ValueError("Model file not found: {}".format( | |||
| model_file_path)) | |||
| if False and ort.get_device() == "GPU": | |||
| options = ort.SessionOptions() | |||
| options.enable_cpu_mem_arena = False | |||
| self.ort_sess = ort.InferenceSession(model_file_path, options=options, providers=[('CUDAExecutionProvider')]) | |||
| else: | |||
| self.ort_sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider']) | |||
| self.input_names = [node.name for node in self.ort_sess.get_inputs()] | |||
| self.output_names = [node.name for node in self.ort_sess.get_outputs()] | |||
| self.input_shape = self.ort_sess.get_inputs()[0].shape[2:4] | |||
| self.label_list = label_list | |||
| @staticmethod | |||
| def sort_Y_firstly(arr, threashold): | |||
| # sort using y1 first and then x1 | |||
| arr = sorted(arr, key=lambda r: (r["top"], r["x0"])) | |||
| for i in range(len(arr) - 1): | |||
| for j in range(i, -1, -1): | |||
| # restore the order using th | |||
| if abs(arr[j + 1]["top"] - arr[j]["top"]) < threashold \ | |||
| and arr[j + 1]["x0"] < arr[j]["x0"]: | |||
| tmp = deepcopy(arr[j]) | |||
| arr[j] = deepcopy(arr[j + 1]) | |||
| arr[j + 1] = deepcopy(tmp) | |||
| return arr | |||
| @staticmethod | |||
| def sort_X_firstly(arr, threashold, copy=True): | |||
| # sort using x0 first and then top | |||
| arr = sorted(arr, key=lambda r: (r["x0"], r["top"])) | |||
| for i in range(len(arr) - 1): | |||
| for j in range(i, -1, -1): | |||
| # restore the order using th | |||
| if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \ | |||
| and arr[j + 1]["top"] < arr[j]["top"]: | |||
| tmp = deepcopy(arr[j]) if copy else arr[j] | |||
| arr[j] = deepcopy(arr[j + 1]) if copy else arr[j + 1] | |||
| arr[j + 1] = deepcopy(tmp) if copy else tmp | |||
| return arr | |||
| @staticmethod | |||
| def sort_C_firstly(arr, thr=0): | |||
| # sort by column ("C") first and then top | |||
| # sorted(arr, key=lambda r: (r["x0"], r["top"])) | |||
| arr = Recognizer.sort_X_firstly(arr, thr) | |||
| for i in range(len(arr) - 1): | |||
| for j in range(i, -1, -1): | |||
| # restore the order using th | |||
| if "C" not in arr[j] or "C" not in arr[j + 1]: | |||
| continue | |||
| if arr[j + 1]["C"] < arr[j]["C"] \ | |||
| or ( | |||
| arr[j + 1]["C"] == arr[j]["C"] | |||
| and arr[j + 1]["top"] < arr[j]["top"] | |||
| ): | |||
| tmp = arr[j] | |||
| arr[j] = arr[j + 1] | |||
| arr[j + 1] = tmp | |||
| return arr | |||
| @staticmethod | |||
| def sort_R_firstly(arr, thr=0): | |||
| # sort by row ("R") first and then x0 | |||
| # sorted(arr, key=lambda r: (r["top"], r["x0"])) | |||
| arr = Recognizer.sort_Y_firstly(arr, thr) | |||
| for i in range(len(arr) - 1): | |||
| for j in range(i, -1, -1): | |||
| if "R" not in arr[j] or "R" not in arr[j + 1]: | |||
| continue | |||
| if arr[j + 1]["R"] < arr[j]["R"] \ | |||
| or ( | |||
| arr[j + 1]["R"] == arr[j]["R"] | |||
| and arr[j + 1]["x0"] < arr[j]["x0"] | |||
| ): | |||
| tmp = arr[j] | |||
| arr[j] = arr[j + 1] | |||
| arr[j + 1] = tmp | |||
| return arr | |||
| @staticmethod | |||
| def overlapped_area(a, b, ratio=True): | |||
| tp, btm, x0, x1 = a["top"], a["bottom"], a["x0"], a["x1"] | |||
| if b["x0"] > x1 or b["x1"] < x0: | |||
| return 0 | |||
| if b["bottom"] < tp or b["top"] > btm: | |||
| return 0 | |||
| x0_ = max(b["x0"], x0) | |||
| x1_ = min(b["x1"], x1) | |||
| assert x0_ <= x1_, "Fuckedup! T:{},B:{},X0:{},X1:{} ==> {}".format( | |||
| tp, btm, x0, x1, b) | |||
| tp_ = max(b["top"], tp) | |||
| btm_ = min(b["bottom"], btm) | |||
| assert tp_ <= btm_, "Fuckedup! T:{},B:{},X0:{},X1:{} => {}".format( | |||
| tp, btm, x0, x1, b) | |||
| ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \ | |||
| x0 != 0 and btm - tp != 0 else 0 | |||
| if ov > 0 and ratio: | |||
| ov /= (x1 - x0) * (btm - tp) | |||
| return ov | |||
| @staticmethod | |||
| def layouts_cleanup(boxes, layouts, far=2, thr=0.7): | |||
| def notOverlapped(a, b): | |||
| return any([a["x1"] < b["x0"], | |||
| a["x0"] > b["x1"], | |||
| a["bottom"] < b["top"], | |||
| a["top"] > b["bottom"]]) | |||
| i = 0 | |||
| while i + 1 < len(layouts): | |||
| j = i + 1 | |||
| while j < min(i + far, len(layouts)) \ | |||
| and (layouts[i].get("type", "") != layouts[j].get("type", "") | |||
| or notOverlapped(layouts[i], layouts[j])): | |||
| j += 1 | |||
| if j >= min(i + far, len(layouts)): | |||
| i += 1 | |||
| continue | |||
| if Recognizer.overlapped_area(layouts[i], layouts[j]) < thr \ | |||
| and Recognizer.overlapped_area(layouts[j], layouts[i]) < thr: | |||
| i += 1 | |||
| continue | |||
| if layouts[i].get("score") and layouts[j].get("score"): | |||
| if layouts[i]["score"] > layouts[j]["score"]: | |||
| layouts.pop(j) | |||
| else: | |||
| layouts.pop(i) | |||
| continue | |||
| area_i, area_i_1 = 0, 0 | |||
| for b in boxes: | |||
| if not notOverlapped(b, layouts[i]): | |||
| area_i += Recognizer.overlapped_area(b, layouts[i], False) | |||
| if not notOverlapped(b, layouts[j]): | |||
| area_i_1 += Recognizer.overlapped_area(b, layouts[j], False) | |||
| if area_i > area_i_1: | |||
| layouts.pop(j) | |||
| else: | |||
| layouts.pop(i) | |||
| return layouts | |||
| def create_inputs(self, imgs, im_info): | |||
| """generate input for different model type | |||
| Args: | |||
| imgs (list(numpy)): list of images (np.ndarray) | |||
| im_info (list(dict)): list of image info | |||
| Returns: | |||
| inputs (dict): input of model | |||
| """ | |||
| inputs = {} | |||
| im_shape = [] | |||
| scale_factor = [] | |||
| if len(imgs) == 1: | |||
| inputs['image'] = np.array((imgs[0],)).astype('float32') | |||
| inputs['im_shape'] = np.array( | |||
| (im_info[0]['im_shape'],)).astype('float32') | |||
| inputs['scale_factor'] = np.array( | |||
| (im_info[0]['scale_factor'],)).astype('float32') | |||
| return inputs | |||
| for e in im_info: | |||
| im_shape.append(np.array((e['im_shape'],)).astype('float32')) | |||
| scale_factor.append(np.array((e['scale_factor'],)).astype('float32')) | |||
| inputs['im_shape'] = np.concatenate(im_shape, axis=0) | |||
| inputs['scale_factor'] = np.concatenate(scale_factor, axis=0) | |||
| imgs_shape = [[e.shape[1], e.shape[2]] for e in imgs] | |||
| max_shape_h = max([e[0] for e in imgs_shape]) | |||
| max_shape_w = max([e[1] for e in imgs_shape]) | |||
| padding_imgs = [] | |||
| for img in imgs: | |||
| im_c, im_h, im_w = img.shape[:] | |||
| padding_im = np.zeros( | |||
| (im_c, max_shape_h, max_shape_w), dtype=np.float32) | |||
| padding_im[:, :im_h, :im_w] = img | |||
| padding_imgs.append(padding_im) | |||
| inputs['image'] = np.stack(padding_imgs, axis=0) | |||
| return inputs | |||
| @staticmethod | |||
| def find_overlapped(box, boxes_sorted_by_y, naive=False): | |||
| if not boxes_sorted_by_y: | |||
| return | |||
| bxs = boxes_sorted_by_y | |||
| s, e, ii = 0, len(bxs), 0 | |||
| while s < e and not naive: | |||
| ii = (e + s) // 2 | |||
| pv = bxs[ii] | |||
| if box["bottom"] < pv["top"]: | |||
| e = ii | |||
| continue | |||
| if box["top"] > pv["bottom"]: | |||
| s = ii + 1 | |||
| continue | |||
| break | |||
| while s < ii: | |||
| if box["top"] > bxs[s]["bottom"]: | |||
| s += 1 | |||
| break | |||
| while e - 1 > ii: | |||
| if box["bottom"] < bxs[e - 1]["top"]: | |||
| e -= 1 | |||
| break | |||
| max_overlaped_i, max_overlaped = None, 0 | |||
| for i in range(s, e): | |||
| ov = Recognizer.overlapped_area(bxs[i], box) | |||
| if ov <= max_overlaped: | |||
| continue | |||
| max_overlaped_i = i | |||
| max_overlaped = ov | |||
| return max_overlaped_i | |||
| @staticmethod | |||
| def find_horizontally_tightest_fit(box, boxes): | |||
| if not boxes: | |||
| return | |||
| min_dis, min_i = 1000000, None | |||
| for i,b in enumerate(boxes): | |||
| if box.get("layoutno", "0") != b.get("layoutno", "0"): continue | |||
| dis = min(abs(box["x0"] - b["x0"]), abs(box["x1"] - b["x1"]), abs(box["x0"]+box["x1"] - b["x1"] - b["x0"])/2) | |||
| if dis < min_dis: | |||
| min_i = i | |||
| min_dis = dis | |||
| return min_i | |||
| @staticmethod | |||
| def find_overlapped_with_threashold(box, boxes, thr=0.3): | |||
| if not boxes: | |||
| return | |||
| max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0 | |||
| s, e = 0, len(boxes) | |||
| for i in range(s, e): | |||
| ov = Recognizer.overlapped_area(box, boxes[i]) | |||
| _ov = Recognizer.overlapped_area(boxes[i], box) | |||
| if (ov, _ov) < (max_overlapped, _max_overlapped): | |||
| continue | |||
| max_overlapped_i = i | |||
| max_overlapped = ov | |||
| _max_overlapped = _ov | |||
| return max_overlapped_i | |||
| def preprocess(self, image_list): | |||
| inputs = [] | |||
| if "scale_factor" in self.input_names: | |||
| preprocess_ops = [] | |||
| for op_info in [ | |||
| {'interp': 2, 'keep_ratio': False, 'target_size': [800, 608], 'type': 'LinearResize'}, | |||
| {'is_scale': True, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225], 'type': 'StandardizeImage'}, | |||
| {'type': 'Permute'}, | |||
| {'stride': 32, 'type': 'PadStride'} | |||
| ]: | |||
| new_op_info = op_info.copy() | |||
| op_type = new_op_info.pop('type') | |||
| preprocess_ops.append(eval(op_type)(**new_op_info)) | |||
| for im_path in image_list: | |||
| im, im_info = preprocess(im_path, preprocess_ops) | |||
| inputs.append({"image": np.array((im,)).astype('float32'), | |||
| "scale_factor": np.array((im_info["scale_factor"],)).astype('float32')}) | |||
| else: | |||
| hh, ww = self.input_shape | |||
| for img in image_list: | |||
| h, w = img.shape[:2] | |||
| img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) | |||
| img = cv2.resize(np.array(img).astype('float32'), (ww, hh)) | |||
| # Scale input pixel values to 0 to 1 | |||
| img /= 255.0 | |||
| img = img.transpose(2, 0, 1) | |||
| img = img[np.newaxis, :, :, :].astype(np.float32) | |||
| inputs.append({self.input_names[0]: img, "scale_factor": [w/ww, h/hh]}) | |||
| return inputs | |||
| def postprocess(self, boxes, inputs, thr): | |||
| if "scale_factor" in self.input_names: | |||
| bb = [] | |||
| for b in boxes: | |||
| clsid, bbox, score = int(b[0]), b[2:], b[1] | |||
| if score < thr: | |||
| continue | |||
| if clsid >= len(self.label_list): | |||
| continue | |||
| bb.append({ | |||
| "type": self.label_list[clsid].lower(), | |||
| "bbox": [float(t) for t in bbox.tolist()], | |||
| "score": float(score) | |||
| }) | |||
| return bb | |||
| def xywh2xyxy(x): | |||
| # [x, y, w, h] to [x1, y1, x2, y2] | |||
| y = np.copy(x) | |||
| y[:, 0] = x[:, 0] - x[:, 2] / 2 | |||
| y[:, 1] = x[:, 1] - x[:, 3] / 2 | |||
| y[:, 2] = x[:, 0] + x[:, 2] / 2 | |||
| y[:, 3] = x[:, 1] + x[:, 3] / 2 | |||
| return y | |||
| def compute_iou(box, boxes): | |||
| # Compute xmin, ymin, xmax, ymax for both boxes | |||
| xmin = np.maximum(box[0], boxes[:, 0]) | |||
| ymin = np.maximum(box[1], boxes[:, 1]) | |||
| xmax = np.minimum(box[2], boxes[:, 2]) | |||
| ymax = np.minimum(box[3], boxes[:, 3]) | |||
| # Compute intersection area | |||
| intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin) | |||
| # Compute union area | |||
| box_area = (box[2] - box[0]) * (box[3] - box[1]) | |||
| boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) | |||
| union_area = box_area + boxes_area - intersection_area | |||
| # Compute IoU | |||
| iou = intersection_area / union_area | |||
| return iou | |||
| def iou_filter(boxes, scores, iou_threshold): | |||
| sorted_indices = np.argsort(scores)[::-1] | |||
| keep_boxes = [] | |||
| while sorted_indices.size > 0: | |||
| # Pick the last box | |||
| box_id = sorted_indices[0] | |||
| keep_boxes.append(box_id) | |||
| # Compute IoU of the picked box with the rest | |||
| ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :]) | |||
| # Remove boxes with IoU over the threshold | |||
| keep_indices = np.where(ious < iou_threshold)[0] | |||
| # print(keep_indices.shape, sorted_indices.shape) | |||
| sorted_indices = sorted_indices[keep_indices + 1] | |||
| return keep_boxes | |||
| boxes = np.squeeze(boxes).T | |||
| # Filter out object confidence scores below threshold | |||
| scores = np.max(boxes[:, 4:], axis=1) | |||
| boxes = boxes[scores > thr, :] | |||
| scores = scores[scores > thr] | |||
| if len(boxes) == 0: return [] | |||
| # Get the class with the highest confidence | |||
| class_ids = np.argmax(boxes[:, 4:], axis=1) | |||
| boxes = boxes[:, :4] | |||
| input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0], inputs["scale_factor"][1]]) | |||
| boxes = np.multiply(boxes, input_shape, dtype=np.float32) | |||
| boxes = xywh2xyxy(boxes) | |||
| unique_class_ids = np.unique(class_ids) | |||
| indices = [] | |||
| for class_id in unique_class_ids: | |||
| class_indices = np.where(class_ids == class_id)[0] | |||
| class_boxes = boxes[class_indices, :] | |||
| class_scores = scores[class_indices] | |||
| class_keep_boxes = iou_filter(class_boxes, class_scores, 0.2) | |||
| indices.extend(class_indices[class_keep_boxes]) | |||
| return [{ | |||
| "type": self.label_list[class_ids[i]].lower(), | |||
| "bbox": [float(t) for t in boxes[i].tolist()], | |||
| "score": float(scores[i]) | |||
| } for i in indices] | |||
| def __call__(self, image_list, thr=0.7, batch_size=16): | |||
| res = [] | |||
| imgs = [] | |||
| for i in range(len(image_list)): | |||
| if not isinstance(image_list[i], np.ndarray): | |||
| imgs.append(np.array(image_list[i])) | |||
| else: imgs.append(image_list[i]) | |||
| batch_loop_cnt = math.ceil(float(len(imgs)) / batch_size) | |||
| for i in range(batch_loop_cnt): | |||
| start_index = i * batch_size | |||
| end_index = min((i + 1) * batch_size, len(imgs)) | |||
| batch_image_list = imgs[start_index:end_index] | |||
| inputs = self.preprocess(batch_image_list) | |||
| print("preprocess") | |||
| for ins in inputs: | |||
| bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names})[0], ins, thr) | |||
| res.append(bb) | |||
| #seeit.save_results(image_list, res, self.label_list, threshold=thr) | |||
| return res | |||
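A minimal, illustrative sketch of driving the generic `Recognizer` above directly; the label list mirrors the layout labels earlier in this PR, while the `"layout"` task name and the random stand-in image are assumptions for illustration.

```python
# Illustrative usage sketch (not part of the diff).
import numpy as np
from deepdoc.vision import Recognizer

labels = ["_background_", "Text", "Title", "Figure", "Figure caption", "Table",
          "Table caption", "Header", "Footer", "Reference", "Equation"]

rec = Recognizer(labels, "layout")     # resolves layout.onnx, downloading it if missing
page = (np.random.rand(800, 600, 3) * 255).astype(np.uint8)   # stand-in for a page image
for box in rec([page], thr=0.5, batch_size=16)[0]:
    print(box["type"], round(box["score"], 3), box["bbox"])
```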
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| from copy import deepcopy | |||
| import onnxruntime as ort | |||
| from huggingface_hub import snapshot_download | |||
| from api.utils.file_utils import get_project_base_directory | |||
| from .operators import * | |||
| class Recognizer(object): | |||
| def __init__(self, label_list, task_name, model_dir=None): | |||
| """ | |||
| If you have trouble downloading HuggingFace models, -_^ this might help!! | |||
| For Linux: | |||
| export HF_ENDPOINT=https://hf-mirror.com | |||
| For Windows: | |||
| Good luck | |||
| ^_- | |||
| """ | |||
| if not model_dir: | |||
| model_dir = os.path.join( | |||
| get_project_base_directory(), | |||
| "rag/res/deepdoc") | |||
| model_file_path = os.path.join(model_dir, task_name + ".onnx") | |||
| if not os.path.exists(model_file_path): | |||
| model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc", | |||
| local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"), | |||
| local_dir_use_symlinks=False) | |||
| model_file_path = os.path.join(model_dir, task_name + ".onnx") | |||
| else: | |||
| model_file_path = os.path.join(model_dir, task_name + ".onnx") | |||
| if not os.path.exists(model_file_path): | |||
| raise ValueError("not find model file path {}".format( | |||
| model_file_path)) | |||
| if False and ort.get_device() == "GPU": | |||
| options = ort.SessionOptions() | |||
| options.enable_cpu_mem_arena = False | |||
| self.ort_sess = ort.InferenceSession(model_file_path, options=options, providers=[('CUDAExecutionProvider')]) | |||
| else: | |||
| self.ort_sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider']) | |||
| self.input_names = [node.name for node in self.ort_sess.get_inputs()] | |||
| self.output_names = [node.name for node in self.ort_sess.get_outputs()] | |||
| self.input_shape = self.ort_sess.get_inputs()[0].shape[2:4] | |||
| self.label_list = label_list | |||
| @staticmethod | |||
| def sort_Y_firstly(arr, threashold): | |||
| # sort using y1 first and then x1 | |||
| arr = sorted(arr, key=lambda r: (r["top"], r["x0"])) | |||
| for i in range(len(arr) - 1): | |||
| for j in range(i, -1, -1): | |||
| # restore the order using th | |||
| if abs(arr[j + 1]["top"] - arr[j]["top"]) < threashold \ | |||
| and arr[j + 1]["x0"] < arr[j]["x0"]: | |||
| tmp = deepcopy(arr[j]) | |||
| arr[j] = deepcopy(arr[j + 1]) | |||
| arr[j + 1] = deepcopy(tmp) | |||
| return arr | |||
| @staticmethod | |||
| def sort_X_firstly(arr, threashold, copy=True): | |||
| # sort using y1 first and then x1 | |||
| arr = sorted(arr, key=lambda r: (r["x0"], r["top"])) | |||
| for i in range(len(arr) - 1): | |||
| for j in range(i, -1, -1): | |||
| # restore the order using th | |||
| if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \ | |||
| and arr[j + 1]["top"] < arr[j]["top"]: | |||
| tmp = deepcopy(arr[j]) if copy else arr[j] | |||
| arr[j] = deepcopy(arr[j + 1]) if copy else arr[j + 1] | |||
| arr[j + 1] = deepcopy(tmp) if copy else tmp | |||
| return arr | |||
| @staticmethod | |||
| def sort_C_firstly(arr, thr=0): | |||
| # sort using y1 first and then x1 | |||
| # sorted(arr, key=lambda r: (r["x0"], r["top"])) | |||
| arr = Recognizer.sort_X_firstly(arr, thr) | |||
| for i in range(len(arr) - 1): | |||
| for j in range(i, -1, -1): | |||
| # restore the order using th | |||
| if "C" not in arr[j] or "C" not in arr[j + 1]: | |||
| continue | |||
| if arr[j + 1]["C"] < arr[j]["C"] \ | |||
| or ( | |||
| arr[j + 1]["C"] == arr[j]["C"] | |||
| and arr[j + 1]["top"] < arr[j]["top"] | |||
| ): | |||
| tmp = arr[j] | |||
| arr[j] = arr[j + 1] | |||
| arr[j + 1] = tmp | |||
| return arr | |||
| return sorted(arr, key=lambda r: (r.get("C", r["x0"]), r["top"])) | |||
| @staticmethod | |||
| def sort_R_firstly(arr, thr=0): | |||
| # sort using y1 first and then x1 | |||
| # sorted(arr, key=lambda r: (r["top"], r["x0"])) | |||
| arr = Recognizer.sort_Y_firstly(arr, thr) | |||
| for i in range(len(arr) - 1): | |||
| for j in range(i, -1, -1): | |||
| if "R" not in arr[j] or "R" not in arr[j + 1]: | |||
| continue | |||
| if arr[j + 1]["R"] < arr[j]["R"] \ | |||
| or ( | |||
| arr[j + 1]["R"] == arr[j]["R"] | |||
| and arr[j + 1]["x0"] < arr[j]["x0"] | |||
| ): | |||
| tmp = arr[j] | |||
| arr[j] = arr[j + 1] | |||
| arr[j + 1] = tmp | |||
| return arr | |||
| @staticmethod | |||
| def overlapped_area(a, b, ratio=True): | |||
| tp, btm, x0, x1 = a["top"], a["bottom"], a["x0"], a["x1"] | |||
| if b["x0"] > x1 or b["x1"] < x0: | |||
| return 0 | |||
| if b["bottom"] < tp or b["top"] > btm: | |||
| return 0 | |||
| x0_ = max(b["x0"], x0) | |||
| x1_ = min(b["x1"], x1) | |||
| assert x0_ <= x1_, "Fuckedup! T:{},B:{},X0:{},X1:{} ==> {}".format( | |||
| tp, btm, x0, x1, b) | |||
| tp_ = max(b["top"], tp) | |||
| btm_ = min(b["bottom"], btm) | |||
| assert tp_ <= btm_, "Fuckedup! T:{},B:{},X0:{},X1:{} => {}".format( | |||
| tp, btm, x0, x1, b) | |||
| ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \ | |||
| x0 != 0 and btm - tp != 0 else 0 | |||
| if ov > 0 and ratio: | |||
| ov /= (x1 - x0) * (btm - tp) | |||
| return ov | |||
| @staticmethod | |||
| def layouts_cleanup(boxes, layouts, far=2, thr=0.7): | |||
| def notOverlapped(a, b): | |||
| return any([a["x1"] < b["x0"], | |||
| a["x0"] > b["x1"], | |||
| a["bottom"] < b["top"], | |||
| a["top"] > b["bottom"]]) | |||
| i = 0 | |||
| while i + 1 < len(layouts): | |||
| j = i + 1 | |||
| while j < min(i + far, len(layouts)) \ | |||
| and (layouts[i].get("type", "") != layouts[j].get("type", "") | |||
| or notOverlapped(layouts[i], layouts[j])): | |||
| j += 1 | |||
| if j >= min(i + far, len(layouts)): | |||
| i += 1 | |||
| continue | |||
| if Recognizer.overlapped_area(layouts[i], layouts[j]) < thr \ | |||
| and Recognizer.overlapped_area(layouts[j], layouts[i]) < thr: | |||
| i += 1 | |||
| continue | |||
| if layouts[i].get("score") and layouts[j].get("score"): | |||
| if layouts[i]["score"] > layouts[j]["score"]: | |||
| layouts.pop(j) | |||
| else: | |||
| layouts.pop(i) | |||
| continue | |||
| area_i, area_i_1 = 0, 0 | |||
| for b in boxes: | |||
| if not notOverlapped(b, layouts[i]): | |||
| area_i += Recognizer.overlapped_area(b, layouts[i], False) | |||
| if not notOverlapped(b, layouts[j]): | |||
| area_i_1 += Recognizer.overlapped_area(b, layouts[j], False) | |||
| if area_i > area_i_1: | |||
| layouts.pop(j) | |||
| else: | |||
| layouts.pop(i) | |||
| return layouts | |||
| def create_inputs(self, imgs, im_info): | |||
| """generate input for different model type | |||
| Args: | |||
| imgs (list(numpy)): list of images (np.ndarray) | |||
| im_info (list(dict)): list of image info | |||
| Returns: | |||
| inputs (dict): input of model | |||
| """ | |||
| inputs = {} | |||
| im_shape = [] | |||
| scale_factor = [] | |||
| if len(imgs) == 1: | |||
| inputs['image'] = np.array((imgs[0],)).astype('float32') | |||
| inputs['im_shape'] = np.array( | |||
| (im_info[0]['im_shape'],)).astype('float32') | |||
| inputs['scale_factor'] = np.array( | |||
| (im_info[0]['scale_factor'],)).astype('float32') | |||
| return inputs | |||
| for e in im_info: | |||
| im_shape.append(np.array((e['im_shape'],)).astype('float32')) | |||
| scale_factor.append(np.array((e['scale_factor'],)).astype('float32')) | |||
| inputs['im_shape'] = np.concatenate(im_shape, axis=0) | |||
| inputs['scale_factor'] = np.concatenate(scale_factor, axis=0) | |||
| imgs_shape = [[e.shape[1], e.shape[2]] for e in imgs] | |||
| max_shape_h = max([e[0] for e in imgs_shape]) | |||
| max_shape_w = max([e[1] for e in imgs_shape]) | |||
| padding_imgs = [] | |||
| for img in imgs: | |||
| im_c, im_h, im_w = img.shape[:] | |||
| padding_im = np.zeros( | |||
| (im_c, max_shape_h, max_shape_w), dtype=np.float32) | |||
| padding_im[:, :im_h, :im_w] = img | |||
| padding_imgs.append(padding_im) | |||
| inputs['image'] = np.stack(padding_imgs, axis=0) | |||
| return inputs | |||
| @staticmethod | |||
| def find_overlapped(box, boxes_sorted_by_y, naive=False): | |||
| if not boxes_sorted_by_y: | |||
| return | |||
| bxs = boxes_sorted_by_y | |||
| s, e, ii = 0, len(bxs), 0 | |||
| while s < e and not naive: | |||
| ii = (e + s) // 2 | |||
| pv = bxs[ii] | |||
| if box["bottom"] < pv["top"]: | |||
| e = ii | |||
| continue | |||
| if box["top"] > pv["bottom"]: | |||
| s = ii + 1 | |||
| continue | |||
| break | |||
| while s < ii: | |||
| if box["top"] > bxs[s]["bottom"]: | |||
| s += 1 | |||
| break | |||
| while e - 1 > ii: | |||
| if box["bottom"] < bxs[e - 1]["top"]: | |||
| e -= 1 | |||
| break | |||
| max_overlaped_i, max_overlaped = None, 0 | |||
| for i in range(s, e): | |||
| ov = Recognizer.overlapped_area(bxs[i], box) | |||
| if ov <= max_overlaped: | |||
| continue | |||
| max_overlaped_i = i | |||
| max_overlaped = ov | |||
| return max_overlaped_i | |||
| @staticmethod | |||
| def find_horizontally_tightest_fit(box, boxes): | |||
| if not boxes: | |||
| return | |||
| min_dis, min_i = 1000000, None | |||
| for i,b in enumerate(boxes): | |||
| if box.get("layoutno", "0") != b.get("layoutno", "0"): continue | |||
| dis = min(abs(box["x0"] - b["x0"]), abs(box["x1"] - b["x1"]), abs(box["x0"]+box["x1"] - b["x1"] - b["x0"])/2) | |||
| if dis < min_dis: | |||
| min_i = i | |||
| min_dis = dis | |||
| return min_i | |||
| @staticmethod | |||
| def find_overlapped_with_threashold(box, boxes, thr=0.3): | |||
| if not boxes: | |||
| return | |||
| max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0 | |||
| s, e = 0, len(boxes) | |||
| for i in range(s, e): | |||
| ov = Recognizer.overlapped_area(box, boxes[i]) | |||
| _ov = Recognizer.overlapped_area(boxes[i], box) | |||
| if (ov, _ov) < (max_overlapped, _max_overlapped): | |||
| continue | |||
| max_overlapped_i = i | |||
| max_overlapped = ov | |||
| _max_overlapped = _ov | |||
| return max_overlapped_i | |||
| def preprocess(self, image_list): | |||
| inputs = [] | |||
| if "scale_factor" in self.input_names: | |||
| preprocess_ops = [] | |||
| for op_info in [ | |||
| {'interp': 2, 'keep_ratio': False, 'target_size': [800, 608], 'type': 'LinearResize'}, | |||
| {'is_scale': True, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225], 'type': 'StandardizeImage'}, | |||
| {'type': 'Permute'}, | |||
| {'stride': 32, 'type': 'PadStride'} | |||
| ]: | |||
| new_op_info = op_info.copy() | |||
| op_type = new_op_info.pop('type') | |||
| preprocess_ops.append(eval(op_type)(**new_op_info)) | |||
| for im_path in image_list: | |||
| im, im_info = preprocess(im_path, preprocess_ops) | |||
| inputs.append({"image": np.array((im,)).astype('float32'), | |||
| "scale_factor": np.array((im_info["scale_factor"],)).astype('float32')}) | |||
| else: | |||
| hh, ww = self.input_shape | |||
| for img in image_list: | |||
| h, w = img.shape[:2] | |||
| img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) | |||
| img = cv2.resize(np.array(img).astype('float32'), (ww, hh)) | |||
| # Scale input pixel values to 0 to 1 | |||
| img /= 255.0 | |||
| img = img.transpose(2, 0, 1) | |||
| img = img[np.newaxis, :, :, :].astype(np.float32) | |||
| inputs.append({self.input_names[0]: img, "scale_factor": [w/ww, h/hh]}) | |||
| return inputs | |||
| def postprocess(self, boxes, inputs, thr): | |||
| if "scale_factor" in self.input_names: | |||
| bb = [] | |||
| for b in boxes: | |||
| clsid, bbox, score = int(b[0]), b[2:], b[1] | |||
| if score < thr: | |||
| continue | |||
| if clsid >= len(self.label_list): | |||
| continue | |||
| bb.append({ | |||
| "type": self.label_list[clsid].lower(), | |||
| "bbox": [float(t) for t in bbox.tolist()], | |||
| "score": float(score) | |||
| }) | |||
| return bb | |||
| def xywh2xyxy(x): | |||
| # [x, y, w, h] to [x1, y1, x2, y2] | |||
| y = np.copy(x) | |||
| y[:, 0] = x[:, 0] - x[:, 2] / 2 | |||
| y[:, 1] = x[:, 1] - x[:, 3] / 2 | |||
| y[:, 2] = x[:, 0] + x[:, 2] / 2 | |||
| y[:, 3] = x[:, 1] + x[:, 3] / 2 | |||
| return y | |||
| def compute_iou(box, boxes): | |||
| # Compute xmin, ymin, xmax, ymax for both boxes | |||
| xmin = np.maximum(box[0], boxes[:, 0]) | |||
| ymin = np.maximum(box[1], boxes[:, 1]) | |||
| xmax = np.minimum(box[2], boxes[:, 2]) | |||
| ymax = np.minimum(box[3], boxes[:, 3]) | |||
| # Compute intersection area | |||
| intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin) | |||
| # Compute union area | |||
| box_area = (box[2] - box[0]) * (box[3] - box[1]) | |||
| boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) | |||
| union_area = box_area + boxes_area - intersection_area | |||
| # Compute IoU | |||
| iou = intersection_area / union_area | |||
| return iou | |||
| def iou_filter(boxes, scores, iou_threshold): | |||
| sorted_indices = np.argsort(scores)[::-1] | |||
| keep_boxes = [] | |||
| while sorted_indices.size > 0: | |||
| # Pick the last box | |||
| box_id = sorted_indices[0] | |||
| keep_boxes.append(box_id) | |||
| # Compute IoU of the picked box with the rest | |||
| ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :]) | |||
| # Remove boxes with IoU over the threshold | |||
| keep_indices = np.where(ious < iou_threshold)[0] | |||
| # print(keep_indices.shape, sorted_indices.shape) | |||
| sorted_indices = sorted_indices[keep_indices + 1] | |||
| return keep_boxes | |||
| boxes = np.squeeze(boxes).T | |||
| # Filter out object confidence scores below threshold | |||
| scores = np.max(boxes[:, 4:], axis=1) | |||
| boxes = boxes[scores > thr, :] | |||
| scores = scores[scores > thr] | |||
| if len(boxes) == 0: return [] | |||
| # Get the class with the highest confidence | |||
| class_ids = np.argmax(boxes[:, 4:], axis=1) | |||
| boxes = boxes[:, :4] | |||
| input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0], inputs["scale_factor"][1]]) | |||
| boxes = np.multiply(boxes, input_shape, dtype=np.float32) | |||
| boxes = xywh2xyxy(boxes) | |||
| unique_class_ids = np.unique(class_ids) | |||
| indices = [] | |||
| for class_id in unique_class_ids: | |||
| class_indices = np.where(class_ids == class_id)[0] | |||
| class_boxes = boxes[class_indices, :] | |||
| class_scores = scores[class_indices] | |||
| class_keep_boxes = iou_filter(class_boxes, class_scores, 0.2) | |||
| indices.extend(class_indices[class_keep_boxes]) | |||
| return [{ | |||
| "type": self.label_list[class_ids[i]].lower(), | |||
| "bbox": [float(t) for t in boxes[i].tolist()], | |||
| "score": float(scores[i]) | |||
| } for i in indices] | |||
| def __call__(self, image_list, thr=0.7, batch_size=16): | |||
| res = [] | |||
| imgs = [] | |||
| for i in range(len(image_list)): | |||
| if not isinstance(image_list[i], np.ndarray): | |||
| imgs.append(np.array(image_list[i])) | |||
| else: imgs.append(image_list[i]) | |||
| batch_loop_cnt = math.ceil(float(len(imgs)) / batch_size) | |||
| for i in range(batch_loop_cnt): | |||
| start_index = i * batch_size | |||
| end_index = min((i + 1) * batch_size, len(imgs)) | |||
| batch_image_list = imgs[start_index:end_index] | |||
| inputs = self.preprocess(batch_image_list) | |||
| print("preprocess") | |||
| for ins in inputs: | |||
| bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names})[0], ins, thr) | |||
| res.append(bb) | |||
| #seeit.save_results(image_list, res, self.label_list, threshold=thr) | |||
| return res | |||
| @@ -1,83 +1,83 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import PIL | |||
| from PIL import ImageDraw | |||
| def save_results(image_list, results, labels, output_dir='output/', threshold=0.5): | |||
| if not os.path.exists(output_dir): | |||
| os.makedirs(output_dir) | |||
| for idx, im in enumerate(image_list): | |||
| im = draw_box(im, results[idx], labels, threshold=threshold) | |||
| out_path = os.path.join(output_dir, f"{idx}.jpg") | |||
| im.save(out_path, quality=95) | |||
| print("save result to: " + out_path) | |||
| def draw_box(im, result, lables, threshold=0.5): | |||
| draw_thickness = min(im.size) // 320 | |||
| draw = ImageDraw.Draw(im) | |||
| color_list = get_color_map_list(len(lables)) | |||
| clsid2color = {n.lower():color_list[i] for i,n in enumerate(lables)} | |||
| result = [r for r in result if r["score"] >= threshold] | |||
| for dt in result: | |||
| color = tuple(clsid2color[dt["type"]]) | |||
| xmin, ymin, xmax, ymax = dt["bbox"] | |||
| draw.line( | |||
| [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), | |||
| (xmin, ymin)], | |||
| width=draw_thickness, | |||
| fill=color) | |||
| # draw label | |||
| text = "{} {:.4f}".format(dt["type"], dt["score"]) | |||
| tw, th = imagedraw_textsize_c(draw, text) | |||
| draw.rectangle( | |||
| [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) | |||
| draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) | |||
| return im | |||
| def get_color_map_list(num_classes): | |||
| """ | |||
| Args: | |||
| num_classes (int): number of classes | |||
| Returns: | |||
| color_map (list): RGB color list | |||
| """ | |||
| color_map = num_classes * [0, 0, 0] | |||
| for i in range(0, num_classes): | |||
| j = 0 | |||
| lab = i | |||
| while lab: | |||
| color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) | |||
| color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) | |||
| color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) | |||
| j += 1 | |||
| lab >>= 3 | |||
| color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] | |||
| return color_map | |||
| def imagedraw_textsize_c(draw, text): | |||
| if int(PIL.__version__.split('.')[0]) < 10: | |||
| tw, th = draw.textsize(text) | |||
| else: | |||
| left, top, right, bottom = draw.textbbox((0, 0), text) | |||
| tw, th = right - left, bottom - top | |||
| return tw, th | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import PIL | |||
| from PIL import ImageDraw | |||
| def save_results(image_list, results, labels, output_dir='output/', threshold=0.5): | |||
| if not os.path.exists(output_dir): | |||
| os.makedirs(output_dir) | |||
| for idx, im in enumerate(image_list): | |||
| im = draw_box(im, results[idx], labels, threshold=threshold) | |||
| out_path = os.path.join(output_dir, f"{idx}.jpg") | |||
| im.save(out_path, quality=95) | |||
| print("save result to: " + out_path) | |||
| def draw_box(im, result, lables, threshold=0.5): | |||
| draw_thickness = min(im.size) // 320 | |||
| draw = ImageDraw.Draw(im) | |||
| color_list = get_color_map_list(len(lables)) | |||
| clsid2color = {n.lower():color_list[i] for i,n in enumerate(lables)} | |||
| result = [r for r in result if r["score"] >= threshold] | |||
| for dt in result: | |||
| color = tuple(clsid2color[dt["type"]]) | |||
| xmin, ymin, xmax, ymax = dt["bbox"] | |||
| draw.line( | |||
| [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), | |||
| (xmin, ymin)], | |||
| width=draw_thickness, | |||
| fill=color) | |||
| # draw label | |||
| text = "{} {:.4f}".format(dt["type"], dt["score"]) | |||
| tw, th = imagedraw_textsize_c(draw, text) | |||
| draw.rectangle( | |||
| [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) | |||
| draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) | |||
| return im | |||
| def get_color_map_list(num_classes): | |||
| """ | |||
| Args: | |||
| num_classes (int): number of classes | |||
| Returns: | |||
| color_map (list): RGB color list | |||
| """ | |||
| color_map = num_classes * [0, 0, 0] | |||
| for i in range(0, num_classes): | |||
| j = 0 | |||
| lab = i | |||
| while lab: | |||
| color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) | |||
| color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) | |||
| color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) | |||
| j += 1 | |||
| lab >>= 3 | |||
| color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] | |||
| return color_map | |||
| def imagedraw_textsize_c(draw, text): | |||
| if int(PIL.__version__.split('.')[0]) < 10: | |||
| tw, th = draw.textsize(text) | |||
| else: | |||
| left, top, right, bottom = draw.textbbox((0, 0), text) | |||
| tw, th = right - left, bottom - top | |||
| return tw, th | |||
| @@ -1,56 +1,56 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import sys | |||
| sys.path.insert( | |||
| 0, | |||
| os.path.abspath( | |||
| os.path.join( | |||
| os.path.dirname( | |||
| os.path.abspath(__file__)), | |||
| '../../'))) | |||
| from deepdoc.vision.seeit import draw_box | |||
| from deepdoc.vision import OCR, init_in_out | |||
| import argparse | |||
| import numpy as np | |||
| def main(args): | |||
| ocr = OCR() | |||
| images, outputs = init_in_out(args) | |||
| for i, img in enumerate(images): | |||
| bxs = ocr(np.array(img)) | |||
| bxs = [(line[0], line[1][0]) for line in bxs] | |||
| bxs = [{ | |||
| "text": t, | |||
| "bbox": [b[0][0], b[0][1], b[1][0], b[-1][1]], | |||
| "type": "ocr", | |||
| "score": 1} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]] | |||
| img = draw_box(images[i], bxs, ["ocr"], 1.) | |||
| img.save(outputs[i], quality=95) | |||
| with open(outputs[i] + ".txt", "w+") as f: | |||
| f.write("\n".join([o["text"] for o in bxs])) | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser() | |||
| parser.add_argument('--inputs', | |||
| help="Directory where to store images or PDFs, or a file path to a single image or PDF", | |||
| required=True) | |||
| parser.add_argument('--output_dir', help="Directory where to store the output images. Default: './ocr_outputs'", | |||
| default="./ocr_outputs") | |||
| args = parser.parse_args() | |||
| main(args) | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| import sys | |||
| sys.path.insert( | |||
| 0, | |||
| os.path.abspath( | |||
| os.path.join( | |||
| os.path.dirname( | |||
| os.path.abspath(__file__)), | |||
| '../../'))) | |||
| from deepdoc.vision.seeit import draw_box | |||
| from deepdoc.vision import OCR, init_in_out | |||
| import argparse | |||
| import numpy as np | |||
| def main(args): | |||
| ocr = OCR() | |||
| images, outputs = init_in_out(args) | |||
| for i, img in enumerate(images): | |||
| bxs = ocr(np.array(img)) | |||
| bxs = [(line[0], line[1][0]) for line in bxs] | |||
| bxs = [{ | |||
| "text": t, | |||
| "bbox": [b[0][0], b[0][1], b[1][0], b[-1][1]], | |||
| "type": "ocr", | |||
| "score": 1} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]] | |||
| img = draw_box(images[i], bxs, ["ocr"], 1.) | |||
| img.save(outputs[i], quality=95) | |||
| with open(outputs[i] + ".txt", "w+") as f: | |||
| f.write("\n".join([o["text"] for o in bxs])) | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser() | |||
| parser.add_argument('--inputs', | |||
| help="Directory where to store images or PDFs, or a file path to a single image or PDF", | |||
| required=True) | |||
| parser.add_argument('--output_dir', help="Directory where to store the output images. Default: './ocr_outputs'", | |||
| default="./ocr_outputs") | |||
| args = parser.parse_args() | |||
| main(args) | |||
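| # Usage sketch (not from the original source): assuming this OCR test script is | |||
| # saved as deepdoc/vision/t_ocr.py and run from the project root, an invocation | |||
| # could look like: | |||
| # | |||
| #   python deepdoc/vision/t_ocr.py --inputs ./my_docs --output_dir ./ocr_outputs | |||
| # | |||
| # For every input image (or PDF page), the script saves a copy with the detected | |||
| # text boxes drawn on it, plus a companion .txt file containing the recognized text. | |||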
| @@ -1,187 +1,187 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os, sys | |||
| sys.path.insert( | |||
| 0, | |||
| os.path.abspath( | |||
| os.path.join( | |||
| os.path.dirname( | |||
| os.path.abspath(__file__)), | |||
| '../../'))) | |||
| from deepdoc.vision.seeit import draw_box | |||
| from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out | |||
| from api.utils.file_utils import get_project_base_directory | |||
| import argparse | |||
| import re | |||
| import numpy as np | |||
| def main(args): | |||
| images, outputs = init_in_out(args) | |||
| if args.mode.lower() == "layout": | |||
| labels = LayoutRecognizer.labels | |||
| detr = Recognizer( | |||
| labels, | |||
| "layout", | |||
| os.path.join( | |||
| get_project_base_directory(), | |||
| "rag/res/deepdoc/")) | |||
| if args.mode.lower() == "tsr": | |||
| labels = TableStructureRecognizer.labels | |||
| detr = TableStructureRecognizer() | |||
| ocr = OCR() | |||
| layouts = detr(images, float(args.threshold)) | |||
| for i, lyt in enumerate(layouts): | |||
| if args.mode.lower() == "tsr": | |||
| #lyt = [t for t in lyt if t["type"] == "table column"] | |||
| html = get_table_html(images[i], lyt, ocr) | |||
| with open(outputs[i] + ".html", "w+") as f: | |||
| f.write(html) | |||
| lyt = [{ | |||
| "type": t["label"], | |||
| "bbox": [t["x0"], t["top"], t["x1"], t["bottom"]], | |||
| "score": t["score"] | |||
| } for t in lyt] | |||
| img = draw_box(images[i], lyt, labels, float(args.threshold)) | |||
| img.save(outputs[i], quality=95) | |||
| print("save result to: " + outputs[i]) | |||
| def get_table_html(img, tb_cpns, ocr): | |||
| boxes = ocr(np.array(img)) | |||
| boxes = Recognizer.sort_Y_firstly( | |||
| [{"x0": b[0][0], "x1": b[1][0], | |||
| "top": b[0][1], "text": t[0], | |||
| "bottom": b[-1][1], | |||
| "layout_type": "table", | |||
| "page_number": 0} for b, t in boxes if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]], | |||
| np.mean([b[-1][1] - b[0][1] for b, _ in boxes]) / 3 | |||
| ) | |||
| def gather(kwd, fzy=10, ption=0.6): | |||
| nonlocal boxes | |||
| eles = Recognizer.sort_Y_firstly( | |||
| [r for r in tb_cpns if re.match(kwd, r["label"])], fzy) | |||
| eles = Recognizer.layouts_cleanup(boxes, eles, 5, ption) | |||
| return Recognizer.sort_Y_firstly(eles, 0) | |||
| headers = gather(r".*header$") | |||
| rows = gather(r".* (row|header)") | |||
| spans = gather(r".*spanning") | |||
| clmns = sorted([r for r in tb_cpns if re.match( | |||
| r"table column$", r["label"])], key=lambda x: x["x0"]) | |||
| clmns = Recognizer.layouts_cleanup(boxes, clmns, 5, 0.5) | |||
| for b in boxes: | |||
| ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3) | |||
| if ii is not None: | |||
| b["R"] = ii | |||
| b["R_top"] = rows[ii]["top"] | |||
| b["R_bott"] = rows[ii]["bottom"] | |||
| ii = Recognizer.find_overlapped_with_threashold(b, headers, thr=0.3) | |||
| if ii is not None: | |||
| b["H_top"] = headers[ii]["top"] | |||
| b["H_bott"] = headers[ii]["bottom"] | |||
| b["H_left"] = headers[ii]["x0"] | |||
| b["H_right"] = headers[ii]["x1"] | |||
| b["H"] = ii | |||
| ii = Recognizer.find_horizontally_tightest_fit(b, clmns) | |||
| if ii is not None: | |||
| b["C"] = ii | |||
| b["C_left"] = clmns[ii]["x0"] | |||
| b["C_right"] = clmns[ii]["x1"] | |||
| ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3) | |||
| if ii is not None: | |||
| b["H_top"] = spans[ii]["top"] | |||
| b["H_bott"] = spans[ii]["bottom"] | |||
| b["H_left"] = spans[ii]["x0"] | |||
| b["H_right"] = spans[ii]["x1"] | |||
| b["SP"] = ii | |||
| html = """ | |||
| <html> | |||
| <head> | |||
| <style> | |||
| ._table_1nkzy_11 { | |||
| margin: auto; | |||
| width: 70%%; | |||
| padding: 10px; | |||
| } | |||
| ._table_1nkzy_11 p { | |||
| margin-bottom: 50px; | |||
| border: 1px solid #e1e1e1; | |||
| } | |||
| caption { | |||
| color: #6ac1ca; | |||
| font-size: 20px; | |||
| height: 50px; | |||
| line-height: 50px; | |||
| font-weight: 600; | |||
| margin-bottom: 10px; | |||
| } | |||
| ._table_1nkzy_11 table { | |||
| width: 100%%; | |||
| border-collapse: collapse; | |||
| } | |||
| th { | |||
| color: #fff; | |||
| background-color: #6ac1ca; | |||
| } | |||
| td:hover { | |||
| background: #c1e8e8; | |||
| } | |||
| tr:nth-child(even) { | |||
| background-color: #f2f2f2; | |||
| } | |||
| ._table_1nkzy_11 th, | |||
| ._table_1nkzy_11 td { | |||
| text-align: center; | |||
| border: 1px solid #ddd; | |||
| padding: 8px; | |||
| } | |||
| </style> | |||
| </head> | |||
| <body> | |||
| %s | |||
| </body> | |||
| </html> | |||
| """ % TableStructureRecognizer.construct_table(boxes, html=True) | |||
| return html | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser() | |||
| parser.add_argument('--inputs', | |||
| help="Directory where to store images or PDFs, or a file path to a single image or PDF", | |||
| required=True) | |||
| parser.add_argument('--output_dir', help="Directory where to store the output images. Default: './layouts_outputs'", | |||
| default="./layouts_outputs") | |||
| parser.add_argument( | |||
| '--threshold', | |||
| help="A threshold to filter out detections. Default: 0.5", | |||
| default=0.5) | |||
| parser.add_argument('--mode', help="Task mode: layout recognition or table structure recognition", choices=["layout", "tsr"], | |||
| default="layout") | |||
| args = parser.parse_args() | |||
| main(args) | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os, sys | |||
| sys.path.insert( | |||
| 0, | |||
| os.path.abspath( | |||
| os.path.join( | |||
| os.path.dirname( | |||
| os.path.abspath(__file__)), | |||
| '../../'))) | |||
| from deepdoc.vision.seeit import draw_box | |||
| from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out | |||
| from api.utils.file_utils import get_project_base_directory | |||
| import argparse | |||
| import re | |||
| import numpy as np | |||
| def main(args): | |||
| images, outputs = init_in_out(args) | |||
| if args.mode.lower() == "layout": | |||
| labels = LayoutRecognizer.labels | |||
| detr = Recognizer( | |||
| labels, | |||
| "layout", | |||
| os.path.join( | |||
| get_project_base_directory(), | |||
| "rag/res/deepdoc/")) | |||
| if args.mode.lower() == "tsr": | |||
| labels = TableStructureRecognizer.labels | |||
| detr = TableStructureRecognizer() | |||
| ocr = OCR() | |||
| layouts = detr(images, float(args.threshold)) | |||
| for i, lyt in enumerate(layouts): | |||
| if args.mode.lower() == "tsr": | |||
| #lyt = [t for t in lyt if t["type"] == "table column"] | |||
| html = get_table_html(images[i], lyt, ocr) | |||
| with open(outputs[i] + ".html", "w+") as f: | |||
| f.write(html) | |||
| lyt = [{ | |||
| "type": t["label"], | |||
| "bbox": [t["x0"], t["top"], t["x1"], t["bottom"]], | |||
| "score": t["score"] | |||
| } for t in lyt] | |||
| img = draw_box(images[i], lyt, labels, float(args.threshold)) | |||
| img.save(outputs[i], quality=95) | |||
| print("save result to: " + outputs[i]) | |||
| def get_table_html(img, tb_cpns, ocr): | |||
| boxes = ocr(np.array(img)) | |||
| boxes = Recognizer.sort_Y_firstly( | |||
| [{"x0": b[0][0], "x1": b[1][0], | |||
| "top": b[0][1], "text": t[0], | |||
| "bottom": b[-1][1], | |||
| "layout_type": "table", | |||
| "page_number": 0} for b, t in boxes if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]], | |||
| np.mean([b[-1][1] - b[0][1] for b, _ in boxes]) / 3 | |||
| ) | |||
| def gather(kwd, fzy=10, ption=0.6): | |||
| nonlocal boxes | |||
| eles = Recognizer.sort_Y_firstly( | |||
| [r for r in tb_cpns if re.match(kwd, r["label"])], fzy) | |||
| eles = Recognizer.layouts_cleanup(boxes, eles, 5, ption) | |||
| return Recognizer.sort_Y_firstly(eles, 0) | |||
| headers = gather(r".*header$") | |||
| rows = gather(r".* (row|header)") | |||
| spans = gather(r".*spanning") | |||
| clmns = sorted([r for r in tb_cpns if re.match( | |||
| r"table column$", r["label"])], key=lambda x: x["x0"]) | |||
| clmns = Recognizer.layouts_cleanup(boxes, clmns, 5, 0.5) | |||
| for b in boxes: | |||
| ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3) | |||
| if ii is not None: | |||
| b["R"] = ii | |||
| b["R_top"] = rows[ii]["top"] | |||
| b["R_bott"] = rows[ii]["bottom"] | |||
| ii = Recognizer.find_overlapped_with_threashold(b, headers, thr=0.3) | |||
| if ii is not None: | |||
| b["H_top"] = headers[ii]["top"] | |||
| b["H_bott"] = headers[ii]["bottom"] | |||
| b["H_left"] = headers[ii]["x0"] | |||
| b["H_right"] = headers[ii]["x1"] | |||
| b["H"] = ii | |||
| ii = Recognizer.find_horizontally_tightest_fit(b, clmns) | |||
| if ii is not None: | |||
| b["C"] = ii | |||
| b["C_left"] = clmns[ii]["x0"] | |||
| b["C_right"] = clmns[ii]["x1"] | |||
| ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3) | |||
| if ii is not None: | |||
| b["H_top"] = spans[ii]["top"] | |||
| b["H_bott"] = spans[ii]["bottom"] | |||
| b["H_left"] = spans[ii]["x0"] | |||
| b["H_right"] = spans[ii]["x1"] | |||
| b["SP"] = ii | |||
| html = """ | |||
| <html> | |||
| <head> | |||
| <style> | |||
| ._table_1nkzy_11 { | |||
| margin: auto; | |||
| width: 70%%; | |||
| padding: 10px; | |||
| } | |||
| ._table_1nkzy_11 p { | |||
| margin-bottom: 50px; | |||
| border: 1px solid #e1e1e1; | |||
| } | |||
| caption { | |||
| color: #6ac1ca; | |||
| font-size: 20px; | |||
| height: 50px; | |||
| line-height: 50px; | |||
| font-weight: 600; | |||
| margin-bottom: 10px; | |||
| } | |||
| ._table_1nkzy_11 table { | |||
| width: 100%%; | |||
| border-collapse: collapse; | |||
| } | |||
| th { | |||
| color: #fff; | |||
| background-color: #6ac1ca; | |||
| } | |||
| td:hover { | |||
| background: #c1e8e8; | |||
| } | |||
| tr:nth-child(even) { | |||
| background-color: #f2f2f2; | |||
| } | |||
| ._table_1nkzy_11 th, | |||
| ._table_1nkzy_11 td { | |||
| text-align: center; | |||
| border: 1px solid #ddd; | |||
| padding: 8px; | |||
| } | |||
| </style> | |||
| </head> | |||
| <body> | |||
| %s | |||
| </body> | |||
| </html> | |||
| """ % TableStructureRecognizer.construct_table(boxes, html=True) | |||
| return html | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser() | |||
| parser.add_argument('--inputs', | |||
| help="Directory where to store images or PDFs, or a file path to a single image or PDF", | |||
| required=True) | |||
| parser.add_argument('--output_dir', help="Directory where to store the output images. Default: './layouts_outputs'", | |||
| default="./layouts_outputs") | |||
| parser.add_argument( | |||
| '--threshold', | |||
| help="A threshold to filter out detections. Default: 0.5", | |||
| default=0.5) | |||
| parser.add_argument('--mode', help="Task mode: layout recognition or table structure recognition", choices=["layout", "tsr"], | |||
| default="layout") | |||
| args = parser.parse_args() | |||
| main(args) | |||
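| # Usage sketch (not from the original source): assuming this script is saved as | |||
| # deepdoc/vision/t_recognizer.py and run from the project root, the two modes | |||
| # could be exercised like this: | |||
| # | |||
| #   python deepdoc/vision/t_recognizer.py --inputs ./my_docs --mode layout --threshold 0.5 | |||
| #   python deepdoc/vision/t_recognizer.py --inputs ./my_docs --mode tsr | |||
| # | |||
| # "layout" saves each input with its detected layout boxes drawn on it, while | |||
| # "tsr" additionally reconstructs each table with OCR and writes it next to the | |||
| # image as an .html file. | |||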
| @@ -1,80 +1,80 @@ | |||
| # Docker Environment Variables | |||
| Look into [.env](./.env); there are some important variables. | |||
| ## MYSQL_PASSWORD | |||
| The MySQL password can be changed with this variable, but you must also update *mysql.password* in [service_conf.yaml](./service_conf.yaml) at the same time. | |||
| ## MYSQL_PORT | |||
| The exported port number of the MySQL Docker container. It is useful if you want to access the database from outside the Docker containers. | |||
| ## MINIO_USER | |||
| The user name of [MinIO](https://github.com/minio/minio). Any change must be kept in sync with *minio.user* in [service_conf.yaml](./service_conf.yaml). | |||
| ## MINIO_PASSWORD | |||
| The password of [MinIO](https://github.com/minio/minio). Any change must be kept in sync with *minio.password* in [service_conf.yaml](./service_conf.yaml). | |||
| ## SVR_HTTP_PORT | |||
| The serving port of the API server. | |||
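| For illustration only, a minimal `.env` sketch using these variables might look like the following; the values are placeholders rather than the shipped defaults, so adjust them to your deployment: | |||
| ``` | |||
| SVR_HTTP_PORT=9380 | |||
| MYSQL_PORT=3306 | |||
| MYSQL_PASSWORD=change_me | |||
| MINIO_USER=change_me_user | |||
| MINIO_PASSWORD=change_me_password | |||
| ``` | |||
| Remember to keep *MYSQL_PASSWORD*, *MINIO_USER*, and *MINIO_PASSWORD* consistent with the corresponding entries in [service_conf.yaml](./service_conf.yaml), as described above. | |||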
| # Service Configuration | |||
| [service_conf.yaml](./service_conf.yaml) is used by the *API server* and the *task executor*. It is the most important configuration file of the system. | |||
| ## ragflow | |||
| ### host | |||
| The IP address used by the API server. | |||
| ### port | |||
| The serving port of the API server. | |||
| ## mysql | |||
| ### name | |||
| The name of the MySQL database used by this system. | |||
| ### user | |||
| The database user name. | |||
| ### password | |||
| The database password. Any change must be kept in sync with *MYSQL_PASSWORD* in [.env](./.env). | |||
| ### port | |||
| The serving port of MySQL inside the container. Any change must be kept in sync with [docker-compose.yml](./docker-compose.yml). | |||
| ### max_connections | |||
| The maximum number of database connections. | |||
| ### stale_timeout | |||
| The timeout duration in seconds. | |||
| ## minio | |||
| ### user | |||
| The username of MinIO. Any change must be kept in sync with *MINIO_USER* in [.env](./.env). | |||
| ### password | |||
| The password of MinIO. Any change must be kept in sync with *MINIO_PASSWORD* in [.env](./.env). | |||
| ### host | |||
| The serving IP and port inside the Docker container. This does not need updating unless you change the minio section of [docker-compose.yml](./docker-compose.yml). | |||
| ## user_default_llm | |||
| Newly signed-up users use the LLM configured here; otherwise, each user needs to configure an LLM of their own in *Settings*. | |||
| ### factory | |||
| The LLM suppliers. "OpenAI", "Tongyi-Qianwen", "ZHIPU-AI", "Moonshot", "DeepSeek", "Baichuan", and "VolcEngine" are supported. | |||
| ### api_key | |||
| The corresponding API key of your assigned LLM vendor. | |||
| ## oauth | |||
| This is the OAuth configuration, which allows users to sign up and sign in to the system with a third-party account. | |||
| ### github | |||
| Go to [GitHub](https://github.com/settings/developers) and register a new application; the *client_id* and *secret_key* will be provided. | |||
| # Docker Environment Variables | |||
| Look into [.env](./.env); there are some important variables. | |||
| ## MYSQL_PASSWORD | |||
| The MySQL password can be changed with this variable, but you must also update *mysql.password* in [service_conf.yaml](./service_conf.yaml) at the same time. | |||
| ## MYSQL_PORT | |||
| The exported port number of the MySQL Docker container. It is useful if you want to access the database from outside the Docker containers. | |||
| ## MINIO_USER | |||
| The user name of [MinIO](https://github.com/minio/minio). Any change must be kept in sync with *minio.user* in [service_conf.yaml](./service_conf.yaml). | |||
| ## MINIO_PASSWORD | |||
| The password of [MinIO](https://github.com/minio/minio). Any change must be kept in sync with *minio.password* in [service_conf.yaml](./service_conf.yaml). | |||
| ## SVR_HTTP_PORT | |||
| The serving port of the API server. | |||
| # Service Configuration | |||
| [service_conf.yaml](./service_conf.yaml) is used by the *API server* and the *task executor*. It is the most important configuration file of the system. | |||
| ## ragflow | |||
| ### host | |||
| The IP address used by the API server. | |||
| ### port | |||
| The serving port of the API server. | |||
| ## mysql | |||
| ### name | |||
| The name of the MySQL database used by this system. | |||
| ### user | |||
| The database user name. | |||
| ### password | |||
| The database password. Any change must be kept in sync with *MYSQL_PASSWORD* in [.env](./.env). | |||
| ### port | |||
| The serving port of MySQL inside the container. Any change must be kept in sync with [docker-compose.yml](./docker-compose.yml). | |||
| ### max_connections | |||
| The maximum number of database connections. | |||
| ### stale_timeout | |||
| The timeout duration in seconds. | |||
| ## minio | |||
| ### user | |||
| The username of MinIO. Any change must be kept in sync with *MINIO_USER* in [.env](./.env). | |||
| ### password | |||
| The password of MinIO. Any change must be kept in sync with *MINIO_PASSWORD* in [.env](./.env). | |||
| ### host | |||
| The serving IP and port inside the Docker container. This does not need updating unless you change the minio section of [docker-compose.yml](./docker-compose.yml). | |||
| ## user_default_llm | |||
| Newly signed-up users use the LLM configured here; otherwise, each user needs to configure an LLM of their own in *Settings*. | |||
| ### factory | |||
| The LLM suppliers. "OpenAI", "Tongyi-Qianwen", "ZHIPU-AI", "Moonshot", "DeepSeek", "Baichuan", and "VolcEngine" are supported. | |||
| ### api_key | |||
| The corresponding API key of your assigned LLM vendor. | |||
| ## oauth | |||
| This is the OAuth configuration, which allows users to sign up and sign in to the system with a third-party account. | |||
| ### github | |||
| Go to [GitHub](https://github.com/settings/developers) and register a new application; the *client_id* and *secret_key* will be provided. | |||
| @@ -1,37 +1,37 @@ | |||
| include: | |||
| - path: ./docker-compose-base.yml | |||
| env_file: ./.env | |||
| services: | |||
| ragflow: | |||
| depends_on: | |||
| mysql: | |||
| condition: service_healthy | |||
| es01: | |||
| condition: service_healthy | |||
| image: edwardelric233/ragflow:oc9 | |||
| container_name: ragflow-server | |||
| ports: | |||
| - ${SVR_HTTP_PORT}:9380 | |||
| - 80:80 | |||
| - 443:443 | |||
| volumes: | |||
| - ./service_conf.yaml:/ragflow/conf/service_conf.yaml | |||
| - ./ragflow-logs:/ragflow/logs | |||
| - ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf | |||
| - ./nginx/proxy.conf:/etc/nginx/proxy.conf | |||
| - ./nginx/nginx.conf:/etc/nginx/nginx.conf | |||
| environment: | |||
| - TZ=${TIMEZONE} | |||
| - HF_ENDPOINT=https://hf-mirror.com | |||
| - MACOS=${MACOS} | |||
| networks: | |||
| - ragflow | |||
| restart: always | |||
| deploy: | |||
| resources: | |||
| reservations: | |||
| devices: | |||
| - driver: nvidia | |||
| count: all | |||
| capabilities: [gpu] | |||
| include: | |||
| - path: ./docker-compose-base.yml | |||
| env_file: ./.env | |||
| services: | |||
| ragflow: | |||
| depends_on: | |||
| mysql: | |||
| condition: service_healthy | |||
| es01: | |||
| condition: service_healthy | |||
| image: edwardelric233/ragflow:oc9 | |||
| container_name: ragflow-server | |||
| ports: | |||
| - ${SVR_HTTP_PORT}:9380 | |||
| - 80:80 | |||
| - 443:443 | |||
| volumes: | |||
| - ./service_conf.yaml:/ragflow/conf/service_conf.yaml | |||
| - ./ragflow-logs:/ragflow/logs | |||
| - ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf | |||
| - ./nginx/proxy.conf:/etc/nginx/proxy.conf | |||
| - ./nginx/nginx.conf:/etc/nginx/nginx.conf | |||
| environment: | |||
| - TZ=${TIMEZONE} | |||
| - HF_ENDPOINT=https://hf-mirror.com | |||
| - MACOS=${MACOS} | |||
| networks: | |||
| - ragflow | |||
| restart: always | |||
| deploy: | |||
| resources: | |||
| reservations: | |||
| devices: | |||
| - driver: nvidia | |||
| count: all | |||
| capabilities: [gpu] | |||
| @@ -1,37 +1,37 @@ | |||
| include: | |||
| - path: ./docker-compose-base.yml | |||
| env_file: ./.env | |||
| services: | |||
| ragflow: | |||
| depends_on: | |||
| mysql: | |||
| condition: service_healthy | |||
| es01: | |||
| condition: service_healthy | |||
| image: swr.cn-north-4.myhuaweicloud.com/infiniflow/ragflow:${RAGFLOW_VERSION} | |||
| container_name: ragflow-server | |||
| ports: | |||
| - ${SVR_HTTP_PORT}:9380 | |||
| - 80:80 | |||
| - 443:443 | |||
| volumes: | |||
| - ./service_conf.yaml:/ragflow/conf/service_conf.yaml | |||
| - ./ragflow-logs:/ragflow/logs | |||
| - ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf | |||
| - ./nginx/proxy.conf:/etc/nginx/proxy.conf | |||
| - ./nginx/nginx.conf:/etc/nginx/nginx.conf | |||
| environment: | |||
| - TZ=${TIMEZONE} | |||
| - HF_ENDPOINT=https://hf-mirror.com | |||
| - MACOS=${MACOS} | |||
| networks: | |||
| - ragflow | |||
| restart: always | |||
| deploy: | |||
| resources: | |||
| reservations: | |||
| devices: | |||
| - driver: nvidia | |||
| count: all | |||
| capabilities: [gpu] | |||
| include: | |||
| - path: ./docker-compose-base.yml | |||
| env_file: ./.env | |||
| services: | |||
| ragflow: | |||
| depends_on: | |||
| mysql: | |||
| condition: service_healthy | |||
| es01: | |||
| condition: service_healthy | |||
| image: swr.cn-north-4.myhuaweicloud.com/infiniflow/ragflow:${RAGFLOW_VERSION} | |||
| container_name: ragflow-server | |||
| ports: | |||
| - ${SVR_HTTP_PORT}:9380 | |||
| - 80:80 | |||
| - 443:443 | |||
| volumes: | |||
| - ./service_conf.yaml:/ragflow/conf/service_conf.yaml | |||
| - ./ragflow-logs:/ragflow/logs | |||
| - ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf | |||
| - ./nginx/proxy.conf:/etc/nginx/proxy.conf | |||
| - ./nginx/nginx.conf:/etc/nginx/nginx.conf | |||
| environment: | |||
| - TZ=${TIMEZONE} | |||
| - HF_ENDPOINT=https://hf-mirror.com | |||
| - MACOS=${MACOS} | |||
| networks: | |||
| - ragflow | |||
| restart: always | |||
| deploy: | |||
| resources: | |||
| reservations: | |||
| devices: | |||
| - driver: nvidia | |||
| count: all | |||
| capabilities: [gpu] | |||
| @@ -1,2 +1,2 @@ | |||
| CREATE DATABASE IF NOT EXISTS rag_flow; | |||
| CREATE DATABASE IF NOT EXISTS rag_flow; | |||
| USE rag_flow; | |||
| @@ -1,33 +1,33 @@ | |||
| user root; | |||
| worker_processes auto; | |||
| error_log /var/log/nginx/error.log notice; | |||
| pid /var/run/nginx.pid; | |||
| events { | |||
| worker_connections 1024; | |||
| } | |||
| http { | |||
| include /etc/nginx/mime.types; | |||
| default_type application/octet-stream; | |||
| log_format main '$remote_addr - $remote_user [$time_local] "$request" ' | |||
| '$status $body_bytes_sent "$http_referer" ' | |||
| '"$http_user_agent" "$http_x_forwarded_for"'; | |||
| access_log /var/log/nginx/access.log main; | |||
| sendfile on; | |||
| #tcp_nopush on; | |||
| keepalive_timeout 65; | |||
| #gzip on; | |||
| client_max_body_size 128M; | |||
| include /etc/nginx/conf.d/ragflow.conf; | |||
| } | |||
| user root; | |||
| worker_processes auto; | |||
| error_log /var/log/nginx/error.log notice; | |||
| pid /var/run/nginx.pid; | |||
| events { | |||
| worker_connections 1024; | |||
| } | |||
| http { | |||
| include /etc/nginx/mime.types; | |||
| default_type application/octet-stream; | |||
| log_format main '$remote_addr - $remote_user [$time_local] "$request" ' | |||
| '$status $body_bytes_sent "$http_referer" ' | |||
| '"$http_user_agent" "$http_x_forwarded_for"'; | |||
| access_log /var/log/nginx/access.log main; | |||
| sendfile on; | |||
| #tcp_nopush on; | |||
| keepalive_timeout 65; | |||
| #gzip on; | |||
| client_max_body_size 128M; | |||
| include /etc/nginx/conf.d/ragflow.conf; | |||
| } | |||
| @@ -1,8 +1,8 @@ | |||
| proxy_set_header Host $host; | |||
| proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; | |||
| proxy_set_header X-Forwarded-Proto $scheme; | |||
| proxy_http_version 1.1; | |||
| proxy_set_header Connection ""; | |||
| proxy_buffering off; | |||
| proxy_read_timeout 3600s; | |||
| proxy_send_timeout 3600s; | |||
| proxy_set_header Host $host; | |||
| proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; | |||
| proxy_set_header X-Forwarded-Proto $scheme; | |||
| proxy_http_version 1.1; | |||
| proxy_set_header Connection ""; | |||
| proxy_buffering off; | |||
| proxy_read_timeout 3600s; | |||
| proxy_send_timeout 3600s; | |||
| @@ -1,28 +1,28 @@ | |||
| server { | |||
| listen 80; | |||
| server_name _; | |||
| root /ragflow/web/dist; | |||
| gzip on; | |||
| gzip_min_length 1k; | |||
| gzip_comp_level 9; | |||
| gzip_types text/plain application/javascript application/x-javascript text/css application/xml text/javascript application/x-httpd-php image/jpeg image/gif image/png; | |||
| gzip_vary on; | |||
| gzip_disable "MSIE [1-6]\."; | |||
| location /v1 { | |||
| proxy_pass http://ragflow:9380; | |||
| include proxy.conf; | |||
| } | |||
| location / { | |||
| index index.html; | |||
| try_files $uri $uri/ /index.html; | |||
| } | |||
| # Cache-Control: max-age, Expires | |||
| location ~ ^/static/(css|js|media)/ { | |||
| expires 10y; | |||
| access_log off; | |||
| } | |||
| } | |||
| server { | |||
| listen 80; | |||
| server_name _; | |||
| root /ragflow/web/dist; | |||
| gzip on; | |||
| gzip_min_length 1k; | |||
| gzip_comp_level 9; | |||
| gzip_types text/plain application/javascript application/x-javascript text/css application/xml text/javascript application/x-httpd-php image/jpeg image/gif image/png; | |||
| gzip_vary on; | |||
| gzip_disable "MSIE [1-6]\."; | |||
| location /v1 { | |||
| proxy_pass http://ragflow:9380; | |||
| include proxy.conf; | |||
| } | |||
| location / { | |||
| index index.html; | |||
| try_files $uri $uri/ /index.html; | |||
| } | |||
| # Cache-Control: max-age, Expires | |||
| location ~ ^/static/(css|js|media)/ { | |||
| expires 10y; | |||
| access_log off; | |||
| } | |||
| } | |||
| @@ -1,43 +1,43 @@ | |||
| ragflow: | |||
| host: 0.0.0.0 | |||
| http_port: 9380 | |||
| mysql: | |||
| name: 'rag_flow' | |||
| user: 'root' | |||
| password: 'infini_rag_flow' | |||
| host: 'mysql' | |||
| port: 3306 | |||
| max_connections: 100 | |||
| stale_timeout: 30 | |||
| minio: | |||
| user: 'rag_flow' | |||
| password: 'infini_rag_flow' | |||
| host: 'minio:9000' | |||
| es: | |||
| hosts: 'http://es01:9200' | |||
| username: 'elastic' | |||
| password: 'infini_rag_flow' | |||
| redis: | |||
| db: 1 | |||
| password: 'infini_rag_flow' | |||
| host: 'redis:6379' | |||
| user_default_llm: | |||
| factory: 'Tongyi-Qianwen' | |||
| api_key: 'sk-xxxxxxxxxxxxx' | |||
| base_url: '' | |||
| oauth: | |||
| github: | |||
| client_id: xxxxxxxxxxxxxxxxxxxxxxxxx | |||
| secret_key: xxxxxxxxxxxxxxxxxxxxxxxxxxxx | |||
| url: https://github.com/login/oauth/access_token | |||
| authentication: | |||
| client: | |||
| switch: false | |||
| http_app_key: | |||
| http_secret_key: | |||
| site: | |||
| switch: false | |||
| permission: | |||
| switch: false | |||
| component: false | |||
| dataset: false | |||
| ragflow: | |||
| host: 0.0.0.0 | |||
| http_port: 9380 | |||
| mysql: | |||
| name: 'rag_flow' | |||
| user: 'root' | |||
| password: 'infini_rag_flow' | |||
| host: 'mysql' | |||
| port: 3306 | |||
| max_connections: 100 | |||
| stale_timeout: 30 | |||
| minio: | |||
| user: 'rag_flow' | |||
| password: 'infini_rag_flow' | |||
| host: 'minio:9000' | |||
| es: | |||
| hosts: 'http://es01:9200' | |||
| username: 'elastic' | |||
| password: 'infini_rag_flow' | |||
| redis: | |||
| db: 1 | |||
| password: 'infini_rag_flow' | |||
| host: 'redis:6379' | |||
| user_default_llm: | |||
| factory: 'Tongyi-Qianwen' | |||
| api_key: 'sk-xxxxxxxxxxxxx' | |||
| base_url: '' | |||
| oauth: | |||
| github: | |||
| client_id: xxxxxxxxxxxxxxxxxxxxxxxxx | |||
| secret_key: xxxxxxxxxxxxxxxxxxxxxxxxxxxx | |||
| url: https://github.com/login/oauth/access_token | |||
| authentication: | |||
| client: | |||
| switch: false | |||
| http_app_key: | |||
| http_secret_key: | |||
| site: | |||
| switch: false | |||
| permission: | |||
| switch: false | |||
| component: false | |||
| dataset: false | |||
| @@ -1,159 +1,159 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import copy | |||
| from tika import parser | |||
| import re | |||
| from io import BytesIO | |||
| from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \ | |||
| hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \ | |||
| tokenize_chunks, find_codec | |||
| from rag.nlp import rag_tokenizer | |||
| from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser | |||
| class Pdf(PdfParser): | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| callback(msg="OCR is running...") | |||
| self.__images__( | |||
| filename if not binary else binary, | |||
| zoomin, | |||
| from_page, | |||
| to_page, | |||
| callback) | |||
| callback(msg="OCR finished") | |||
| from timeit import default_timer as timer | |||
| start = timer() | |||
| self._layouts_rec(zoomin) | |||
| callback(0.67, "Layout analysis finished") | |||
| print("layouts:", timer() - start) | |||
| self._table_transformer_job(zoomin) | |||
| callback(0.68, "Table analysis finished") | |||
| self._text_merge() | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| self._naive_vertical_merge() | |||
| self._filter_forpages() | |||
| self._merge_with_same_bullet() | |||
| callback(0.75, "Text merging finished.") | |||
| callback(0.8, "Text extraction finished") | |||
| return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) | |||
| for b in self.boxes], tbls | |||
| def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Supported file formats are docx, pdf, txt, html and doc. | |||
| Since a book is long and not all of its parts are useful, if it's a PDF, | |||
| please set up the page ranges for every book in order to eliminate negative effects and save computing time. | |||
| """ | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| pdf_parser = None | |||
| sections, tbls = [], [] | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| doc_parser = DocxParser() | |||
| # TODO: table of contents need to be removed | |||
| sections, tbls = doc_parser( | |||
| binary if binary else filename, from_page=from_page, to_page=to_page) | |||
| remove_contents_table(sections, eng=is_english( | |||
| random_choices([t for t, _ in sections], k=200))) | |||
| tbls = [((None, lns), None) for lns in tbls] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| pdf_parser = Pdf() if kwargs.get( | |||
| "parser_config", {}).get( | |||
| "layout_recognize", True) else PlainParser() | |||
| sections, tbls = pdf_parser(filename if not binary else binary, | |||
| from_page=from_page, to_page=to_page, callback=callback) | |||
| elif re.search(r"\.txt$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| txt = "" | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| while True: | |||
| l = f.readline() | |||
| if not l: | |||
| break | |||
| txt += l | |||
| sections = txt.split("\n") | |||
| sections = [(l, "") for l in sections if l] | |||
| remove_contents_table(sections, eng=is_english( | |||
| random_choices([t for t, _ in sections], k=200))) | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = HtmlParser()(filename, binary) | |||
| sections = [(l, "") for l in sections if l] | |||
| remove_contents_table(sections, eng=is_english( | |||
| random_choices([t for t, _ in sections], k=200))) | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.doc$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| binary = BytesIO(binary) | |||
| doc_parsed = parser.from_buffer(binary) | |||
| sections = doc_parsed['content'].split('\n') | |||
| sections = [(l, "") for l in sections if l] | |||
| remove_contents_table(sections, eng=is_english( | |||
| random_choices([t for t, _ in sections], k=200))) | |||
| callback(0.8, "Finish parsing.") | |||
| else: | |||
| raise NotImplementedError( | |||
| "file type not supported yet(doc, docx, pdf, txt supported)") | |||
| make_colon_as_title(sections) | |||
| bull = bullets_category( | |||
| [t for t in random_choices([t for t, _ in sections], k=100)]) | |||
| if bull >= 0: | |||
| chunks = ["\n".join(ck) | |||
| for ck in hierarchical_merge(bull, sections, 5)] | |||
| else: | |||
| sections = [s.split("@") for s, _ in sections] | |||
| sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections ] | |||
| chunks = naive_merge( | |||
| sections, kwargs.get( | |||
| "chunk_token_num", 256), kwargs.get( | |||
| "delimer", "\n。;!?")) | |||
| # is it English | |||
| # is_english(random_choices([t for t, _ in sections], k=218)) | |||
| eng = lang.lower() == "english" | |||
| res = tokenize_table(tbls, doc, eng) | |||
| res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) | |||
| return res | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy) | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import copy | |||
| from tika import parser | |||
| import re | |||
| from io import BytesIO | |||
| from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \ | |||
| hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \ | |||
| tokenize_chunks, find_codec | |||
| from rag.nlp import rag_tokenizer | |||
| from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser | |||
| class Pdf(PdfParser): | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| callback(msg="OCR is running...") | |||
| self.__images__( | |||
| filename if not binary else binary, | |||
| zoomin, | |||
| from_page, | |||
| to_page, | |||
| callback) | |||
| callback(msg="OCR finished") | |||
| from timeit import default_timer as timer | |||
| start = timer() | |||
| self._layouts_rec(zoomin) | |||
| callback(0.67, "Layout analysis finished") | |||
| print("layouts:", timer() - start) | |||
| self._table_transformer_job(zoomin) | |||
| callback(0.68, "Table analysis finished") | |||
| self._text_merge() | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| self._naive_vertical_merge() | |||
| self._filter_forpages() | |||
| self._merge_with_same_bullet() | |||
| callback(0.75, "Text merging finished.") | |||
| callback(0.8, "Text extraction finished") | |||
| return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) | |||
| for b in self.boxes], tbls | |||
| def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Supported file formats are docx, pdf, txt, html and doc. | |||
| Since a book is long and not all of its parts are useful, if it's a PDF, | |||
| please set up the page ranges for every book in order to eliminate negative effects and save computing time. | |||
| """ | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| pdf_parser = None | |||
| sections, tbls = [], [] | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| doc_parser = DocxParser() | |||
| # TODO: table of contents need to be removed | |||
| sections, tbls = doc_parser( | |||
| binary if binary else filename, from_page=from_page, to_page=to_page) | |||
| remove_contents_table(sections, eng=is_english( | |||
| random_choices([t for t, _ in sections], k=200))) | |||
| tbls = [((None, lns), None) for lns in tbls] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| pdf_parser = Pdf() if kwargs.get( | |||
| "parser_config", {}).get( | |||
| "layout_recognize", True) else PlainParser() | |||
| sections, tbls = pdf_parser(filename if not binary else binary, | |||
| from_page=from_page, to_page=to_page, callback=callback) | |||
| elif re.search(r"\.txt$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| txt = "" | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| while True: | |||
| l = f.readline() | |||
| if not l: | |||
| break | |||
| txt += l | |||
| sections = txt.split("\n") | |||
| sections = [(l, "") for l in sections if l] | |||
| remove_contents_table(sections, eng=is_english( | |||
| random_choices([t for t, _ in sections], k=200))) | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = HtmlParser()(filename, binary) | |||
| sections = [(l, "") for l in sections if l] | |||
| remove_contents_table(sections, eng=is_english( | |||
| random_choices([t for t, _ in sections], k=200))) | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.doc$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| binary = BytesIO(binary) | |||
| doc_parsed = parser.from_buffer(binary) | |||
| sections = doc_parsed['content'].split('\n') | |||
| sections = [(l, "") for l in sections if l] | |||
| remove_contents_table(sections, eng=is_english( | |||
| random_choices([t for t, _ in sections], k=200))) | |||
| callback(0.8, "Finish parsing.") | |||
| else: | |||
| raise NotImplementedError( | |||
| "file type not supported yet(doc, docx, pdf, txt supported)") | |||
| make_colon_as_title(sections) | |||
| bull = bullets_category( | |||
| [t for t in random_choices([t for t, _ in sections], k=100)]) | |||
| if bull >= 0: | |||
| chunks = ["\n".join(ck) | |||
| for ck in hierarchical_merge(bull, sections, 5)] | |||
| else: | |||
| sections = [s.split("@") for s, _ in sections] | |||
| sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections ] | |||
| chunks = naive_merge( | |||
| sections, kwargs.get( | |||
| "chunk_token_num", 256), kwargs.get( | |||
| "delimer", "\n。;!?")) | |||
| # is it English | |||
| # is_english(random_choices([t for t, _ in sections], k=218)) | |||
| eng = lang.lower() == "english" | |||
| res = tokenize_table(tbls, doc, eng) | |||
| res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) | |||
| return res | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy) | |||
| @@ -1,220 +1,220 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import copy | |||
| from tika import parser | |||
| import re | |||
| from io import BytesIO | |||
| from docx import Document | |||
| from api.db import ParserType | |||
| from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \ | |||
| make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level | |||
| from rag.nlp import rag_tokenizer | |||
| from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser | |||
| from rag.settings import cron_logger | |||
| class Docx(DocxParser): | |||
| def __init__(self): | |||
| pass | |||
| def __clean(self, line): | |||
| line = re.sub(r"\u3000", " ", line).strip() | |||
| return line | |||
| def old_call(self, filename, binary=None, from_page=0, to_page=100000): | |||
| self.doc = Document( | |||
| filename) if not binary else Document(BytesIO(binary)) | |||
| pn = 0 | |||
| lines = [] | |||
| for p in self.doc.paragraphs: | |||
| if pn > to_page: | |||
| break | |||
| if from_page <= pn < to_page and p.text.strip(): | |||
| lines.append(self.__clean(p.text)) | |||
| for run in p.runs: | |||
| if 'lastRenderedPageBreak' in run._element.xml: | |||
| pn += 1 | |||
| continue | |||
| if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: | |||
| pn += 1 | |||
| return [l for l in lines if l] | |||
| def __call__(self, filename, binary=None, from_page=0, to_page=100000): | |||
| self.doc = Document( | |||
| filename) if not binary else Document(BytesIO(binary)) | |||
| pn = 0 | |||
| lines = [] | |||
| bull = bullets_category([p.text for p in self.doc.paragraphs]) | |||
| for p in self.doc.paragraphs: | |||
| if pn > to_page: | |||
| break | |||
| question_level, p_text = docx_question_level(p, bull) | |||
| if not p_text.strip("\n"):continue | |||
| lines.append((question_level, p_text)) | |||
| for run in p.runs: | |||
| if 'lastRenderedPageBreak' in run._element.xml: | |||
| pn += 1 | |||
| continue | |||
| if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: | |||
| pn += 1 | |||
| visit = [False for _ in range(len(lines))] | |||
| sections = [] | |||
| for s in range(len(lines)): | |||
| e = s + 1 | |||
| while e < len(lines): | |||
| if lines[e][0] <= lines[s][0]: | |||
| break | |||
| e += 1 | |||
| if e - s == 1 and visit[s]: continue | |||
| sec = [] | |||
| next_level = lines[s][0] + 1 | |||
| while not sec and next_level < 22: | |||
| for i in range(s+1, e): | |||
| if lines[i][0] != next_level: continue | |||
| sec.append(lines[i][1]) | |||
| visit[i] = True | |||
| next_level += 1 | |||
| sec.insert(0, lines[s][1]) | |||
| sections.append("\n".join(sec)) | |||
| return [l for l in sections if l] | |||
| def __str__(self) -> str: | |||
| return f''' | |||
| question:{self.question}, | |||
| answer:{self.answer}, | |||
| level:{self.level}, | |||
| childs:{self.childs} | |||
| ''' | |||
| class Pdf(PdfParser): | |||
| def __init__(self): | |||
| self.model_speciess = ParserType.LAWS.value | |||
| super().__init__() | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| callback(msg="OCR is running...") | |||
| self.__images__( | |||
| filename if not binary else binary, | |||
| zoomin, | |||
| from_page, | |||
| to_page, | |||
| callback | |||
| ) | |||
| callback(msg="OCR finished") | |||
| from timeit import default_timer as timer | |||
| start = timer() | |||
| self._layouts_rec(zoomin) | |||
| callback(0.67, "Layout analysis finished") | |||
| cron_logger.info("layouts:".format( | |||
| (timer() - start) / (self.total_page + 0.1))) | |||
| self._naive_vertical_merge() | |||
| callback(0.8, "Text extraction finished") | |||
| return [(b["text"], self._line_tag(b, zoomin)) | |||
| for b in self.boxes], None | |||
| def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Supported file formats are docx, pdf, txt, html and doc. | |||
| """ | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| pdf_parser = None | |||
| sections = [] | |||
| # is it English | |||
| eng = lang.lower() == "english" # is_english(sections) | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| for txt in Docx()(filename, binary): | |||
| sections.append(txt) | |||
| callback(0.8, "Finish parsing.") | |||
| chunks = sections | |||
| return tokenize_chunks(chunks, doc, eng, pdf_parser) | |||
| elif re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| pdf_parser = Pdf() if kwargs.get( | |||
| "parser_config", {}).get( | |||
| "layout_recognize", True) else PlainParser() | |||
| for txt, poss in pdf_parser(filename if not binary else binary, | |||
| from_page=from_page, to_page=to_page, callback=callback)[0]: | |||
| sections.append(txt + poss) | |||
| elif re.search(r"\.txt$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| txt = "" | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| while True: | |||
| l = f.readline() | |||
| if not l: | |||
| break | |||
| txt += l | |||
| sections = txt.split("\n") | |||
| sections = [l for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = HtmlParser()(filename, binary) | |||
| sections = [l for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.doc$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| binary = BytesIO(binary) | |||
| doc_parsed = parser.from_buffer(binary) | |||
| sections = doc_parsed['content'].split('\n') | |||
| sections = [l for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| else: | |||
| raise NotImplementedError( | |||
| "file type not supported yet(doc, docx, pdf, txt supported)") | |||
| # Remove 'Contents' part | |||
| remove_contents_table(sections, eng) | |||
| make_colon_as_title(sections) | |||
| bull = bullets_category(sections) | |||
| chunks = hierarchical_merge(bull, sections, 5) | |||
| if not chunks: | |||
| callback(0.99, "No chunk parsed out.") | |||
| return tokenize_chunks(["\n".join(ck) | |||
| for ck in chunks], doc, eng, pdf_parser) | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], callback=dummy) | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import copy | |||
| from tika import parser | |||
| import re | |||
| from io import BytesIO | |||
| from docx import Document | |||
| from api.db import ParserType | |||
| from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \ | |||
| make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level | |||
| from rag.nlp import rag_tokenizer | |||
| from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser | |||
| from rag.settings import cron_logger | |||
| class Docx(DocxParser): | |||
| def __init__(self): | |||
| pass | |||
| def __clean(self, line): | |||
| line = re.sub(r"\u3000", " ", line).strip() | |||
| return line | |||
| def old_call(self, filename, binary=None, from_page=0, to_page=100000): | |||
| self.doc = Document( | |||
| filename) if not binary else Document(BytesIO(binary)) | |||
| pn = 0 | |||
| lines = [] | |||
| for p in self.doc.paragraphs: | |||
| if pn > to_page: | |||
| break | |||
| if from_page <= pn < to_page and p.text.strip(): | |||
| lines.append(self.__clean(p.text)) | |||
| for run in p.runs: | |||
| if 'lastRenderedPageBreak' in run._element.xml: | |||
| pn += 1 | |||
| continue | |||
| if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: | |||
| pn += 1 | |||
| return [l for l in lines if l] | |||
| def __call__(self, filename, binary=None, from_page=0, to_page=100000): | |||
| self.doc = Document( | |||
| filename) if not binary else Document(BytesIO(binary)) | |||
| pn = 0 | |||
| lines = [] | |||
| bull = bullets_category([p.text for p in self.doc.paragraphs]) | |||
| for p in self.doc.paragraphs: | |||
| if pn > to_page: | |||
| break | |||
| question_level, p_text = docx_question_level(p, bull) | |||
| if not p_text.strip("\n"):continue | |||
| lines.append((question_level, p_text)) | |||
| for run in p.runs: | |||
| if 'lastRenderedPageBreak' in run._element.xml: | |||
| pn += 1 | |||
| continue | |||
| if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: | |||
| pn += 1 | |||
| visit = [False for _ in range(len(lines))] | |||
| sections = [] | |||
| for s in range(len(lines)): | |||
| e = s + 1 | |||
| while e < len(lines): | |||
| if lines[e][0] <= lines[s][0]: | |||
| break | |||
| e += 1 | |||
| if e - s == 1 and visit[s]: continue | |||
| sec = [] | |||
| next_level = lines[s][0] + 1 | |||
| while not sec and next_level < 22: | |||
| for i in range(s+1, e): | |||
| if lines[i][0] != next_level: continue | |||
| sec.append(lines[i][1]) | |||
| visit[i] = True | |||
| next_level += 1 | |||
| sec.insert(0, lines[s][1]) | |||
| sections.append("\n".join(sec)) | |||
| return [l for l in sections if l] | |||
| def __str__(self) -> str: | |||
| return f''' | |||
| question:{self.question}, | |||
| answer:{self.answer}, | |||
| level:{self.level}, | |||
| childs:{self.childs} | |||
| ''' | |||
| class Pdf(PdfParser): | |||
| def __init__(self): | |||
| self.model_speciess = ParserType.LAWS.value | |||
| super().__init__() | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| callback(msg="OCR is running...") | |||
| self.__images__( | |||
| filename if not binary else binary, | |||
| zoomin, | |||
| from_page, | |||
| to_page, | |||
| callback | |||
| ) | |||
| callback(msg="OCR finished") | |||
| from timeit import default_timer as timer | |||
| start = timer() | |||
| self._layouts_rec(zoomin) | |||
| callback(0.67, "Layout analysis finished") | |||
| cron_logger.info("layouts:".format( | |||
| (timer() - start) / (self.total_page + 0.1))) | |||
| self._naive_vertical_merge() | |||
| callback(0.8, "Text extraction finished") | |||
| return [(b["text"], self._line_tag(b, zoomin)) | |||
| for b in self.boxes], None | |||
| def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Supported file formats are docx, pdf, txt, html and doc. | |||
| """ | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| pdf_parser = None | |||
| sections = [] | |||
| # is it English | |||
| eng = lang.lower() == "english" # is_english(sections) | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| for txt in Docx()(filename, binary): | |||
| sections.append(txt) | |||
| callback(0.8, "Finish parsing.") | |||
| chunks = sections | |||
| return tokenize_chunks(chunks, doc, eng, pdf_parser) | |||
| elif re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| pdf_parser = Pdf() if kwargs.get( | |||
| "parser_config", {}).get( | |||
| "layout_recognize", True) else PlainParser() | |||
| for txt, poss in pdf_parser(filename if not binary else binary, | |||
| from_page=from_page, to_page=to_page, callback=callback)[0]: | |||
| sections.append(txt + poss) | |||
| elif re.search(r"\.txt$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| txt = "" | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| while True: | |||
| l = f.readline() | |||
| if not l: | |||
| break | |||
| txt += l | |||
| sections = txt.split("\n") | |||
| sections = [l for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = HtmlParser()(filename, binary) | |||
| sections = [l for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.doc$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| binary = BytesIO(binary) | |||
| doc_parsed = parser.from_buffer(binary) | |||
| sections = doc_parsed['content'].split('\n') | |||
| sections = [l for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| else: | |||
| raise NotImplementedError( | |||
| "file type not supported yet(doc, docx, pdf, txt supported)") | |||
| # Remove 'Contents' part | |||
| remove_contents_table(sections, eng) | |||
| make_colon_as_title(sections) | |||
| bull = bullets_category(sections) | |||
| chunks = hierarchical_merge(bull, sections, 5) | |||
| if not chunks: | |||
| callback(0.99, "No chunk parsed out.") | |||
| return tokenize_chunks(["\n".join(ck) | |||
| for ck in chunks], doc, eng, pdf_parser) | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], callback=dummy) | |||
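| # The grouping loop in Docx.__call__ above can be hard to follow inline, so here is a | |||
| # standalone sketch of the same idea (illustrative only, assuming smaller level numbers | |||
| # mean higher-ranked headings, as docx_question_level returns): each heading collects | |||
| # the text of the next deeper level found inside its span. | |||
| def group_by_level(lines): | |||
| """lines is a list of (level, text) pairs in document order.""" | |||
| visited = [False] * len(lines) | |||
| sections = [] | |||
| for s, (s_level, s_text) in enumerate(lines): | |||
| # the heading's span ends at the next line of equal or higher rank | |||
| e = s + 1 | |||
| while e < len(lines) and lines[e][0] > s_level: | |||
| e += 1 | |||
| if e - s == 1 and visited[s]: | |||
| continue | |||
| body, next_level = [], s_level + 1 | |||
| while not body and next_level < 22:  # same level cap as the code above | |||
| for i in range(s + 1, e): | |||
| if lines[i][0] == next_level: | |||
| body.append(lines[i][1]) | |||
| visited[i] = True | |||
| next_level += 1 | |||
| sections.append("\n".join([s_text] + body)) | |||
| return [sec for sec in sections if sec] | |||
| # group_by_level([(1, "Chapter 1"), (2, "Article 1"), (2, "Article 2"), (1, "Chapter 2")]) | |||
| # -> ["Chapter 1\nArticle 1\nArticle 2", "Chapter 2"] | |||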
| @@ -1,272 +1,272 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import copy | |||
| import re | |||
| from api.db import ParserType | |||
| from io import BytesIO | |||
| from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, docx_question_level | |||
| from deepdoc.parser import PdfParser, PlainParser | |||
| from rag.utils import num_tokens_from_string | |||
| from deepdoc.parser import PdfParser, ExcelParser, DocxParser | |||
| from docx import Document | |||
| from PIL import Image | |||
| class Pdf(PdfParser): | |||
| def __init__(self): | |||
| self.model_speciess = ParserType.MANUAL.value | |||
| super().__init__() | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| from timeit import default_timer as timer | |||
| start = timer() | |||
| callback(msg="OCR is running...") | |||
| self.__images__( | |||
| filename if not binary else binary, | |||
| zoomin, | |||
| from_page, | |||
| to_page, | |||
| callback | |||
| ) | |||
| callback(msg="OCR finished.") | |||
| # for bb in self.boxes: | |||
| # for b in bb: | |||
| # print(b) | |||
| print("OCR:", timer() - start) | |||
| self._layouts_rec(zoomin) | |||
| callback(0.65, "Layout analysis finished.") | |||
| print("layouts:", timer() - start) | |||
| self._table_transformer_job(zoomin) | |||
| callback(0.67, "Table analysis finished.") | |||
| self._text_merge() | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| self._concat_downward() | |||
| self._filter_forpages() | |||
| callback(0.68, "Text merging finished") | |||
| # clean mess | |||
| for b in self.boxes: | |||
| b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip()) | |||
| return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin)) | |||
| for i, b in enumerate(self.boxes)], tbls | |||
| class Docx(DocxParser): | |||
| def __init__(self): | |||
| pass | |||
| def get_picture(self, document, paragraph): | |||
| img = paragraph._element.xpath('.//pic:pic') | |||
| if not img: | |||
| return None | |||
| img = img[0] | |||
| embed = img.xpath('.//a:blip/@r:embed')[0] | |||
| related_part = document.part.related_parts[embed] | |||
| image = related_part.image | |||
| image = Image.open(BytesIO(image.blob)) | |||
| return image | |||
| def concat_img(self, img1, img2): | |||
| if img1 and not img2: | |||
| return img1 | |||
| if not img1 and img2: | |||
| return img2 | |||
| if not img1 and not img2: | |||
| return None | |||
| width1, height1 = img1.size | |||
| width2, height2 = img2.size | |||
| new_width = max(width1, width2) | |||
| new_height = height1 + height2 | |||
| new_image = Image.new('RGB', (new_width, new_height)) | |||
| new_image.paste(img1, (0, 0)) | |||
| new_image.paste(img2, (0, height1)) | |||
| return new_image | |||
| def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None): | |||
| self.doc = Document( | |||
| filename) if not binary else Document(BytesIO(binary)) | |||
| pn = 0 | |||
| last_answer, last_image = "", None | |||
| question_stack, level_stack = [], [] | |||
| ti_list = [] | |||
| for p in self.doc.paragraphs: | |||
| if pn > to_page: | |||
| break | |||
| question_level, p_text = 0, '' | |||
| if from_page <= pn < to_page and p.text.strip(): | |||
| question_level, p_text = docx_question_level(p) | |||
| if not question_level or question_level > 6: # not a question | |||
| last_answer = f'{last_answer}\n{p_text}' | |||
| current_image = self.get_picture(self.doc, p) | |||
| last_image = self.concat_img(last_image, current_image) | |||
| else: # is a question | |||
| if last_answer or last_image: | |||
| sum_question = '\n'.join(question_stack) | |||
| if sum_question: | |||
| ti_list.append((f'{sum_question}\n{last_answer}', last_image)) | |||
| last_answer, last_image = '', None | |||
| i = question_level | |||
| while question_stack and i <= level_stack[-1]: | |||
| question_stack.pop() | |||
| level_stack.pop() | |||
| question_stack.append(p_text) | |||
| level_stack.append(question_level) | |||
| for run in p.runs: | |||
| if 'lastRenderedPageBreak' in run._element.xml: | |||
| pn += 1 | |||
| continue | |||
| if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: | |||
| pn += 1 | |||
| if last_answer: | |||
| sum_question = '\n'.join(question_stack) | |||
| if sum_question: | |||
| ti_list.append((f'{sum_question}\n{last_answer}', last_image)) | |||
| tbls = [] | |||
| for tb in self.doc.tables: | |||
| html= "<table>" | |||
| for r in tb.rows: | |||
| html += "<tr>" | |||
| i = 0 | |||
| while i < len(r.cells): | |||
| span = 1 | |||
| c = r.cells[i] | |||
| for j in range(i+1, len(r.cells)): | |||
| if c.text == r.cells[j].text: | |||
| span += 1 | |||
| i = j | |||
| i += 1 | |||
| html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>" | |||
| html += "</tr>" | |||
| html += "</table>" | |||
| tbls.append(((None, html), "")) | |||
| return ti_list, tbls | |||
| def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Only pdf and docx are supported. | |||
| """ | |||
| pdf_parser = None | |||
| doc = { | |||
| "docnm_kwd": filename | |||
| } | |||
| doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"])) | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| # is it English | |||
| eng = lang.lower() == "english" # pdf_parser.is_english | |||
| if re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| pdf_parser = Pdf() if kwargs.get( | |||
| "parser_config", {}).get( | |||
| "layout_recognize", True) else PlainParser() | |||
| sections, tbls = pdf_parser(filename if not binary else binary, | |||
| from_page=from_page, to_page=to_page, callback=callback) | |||
| if sections and len(sections[0]) < 3: | |||
| sections = [(t, l, [[0] * 5]) for t, l in sections] | |||
| # set pivot using the most frequent type of title, | |||
| # then merge between 2 pivot | |||
| if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1: | |||
| max_lvl = max([lvl for _, lvl in pdf_parser.outlines]) | |||
| most_level = max(0, max_lvl - 1) | |||
| levels = [] | |||
| for txt, _, _ in sections: | |||
| for t, lvl in pdf_parser.outlines: | |||
| tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)]) | |||
| tks_ = set([txt[i] + txt[i + 1] | |||
| for i in range(min(len(t), len(txt) - 1))]) | |||
| if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8: | |||
| levels.append(lvl) | |||
| break | |||
| else: | |||
| levels.append(max_lvl + 1) | |||
| else: | |||
| bull = bullets_category([txt for txt, _, _ in sections]) | |||
| most_level, levels = title_frequency( | |||
| bull, [(txt, l) for txt, l, poss in sections]) | |||
| assert len(sections) == len(levels) | |||
| sec_ids = [] | |||
| sid = 0 | |||
| for i, lvl in enumerate(levels): | |||
| if lvl <= most_level and i > 0 and lvl != levels[i - 1]: | |||
| sid += 1 | |||
| sec_ids.append(sid) | |||
| # print(lvl, self.boxes[i]["text"], most_level, sid) | |||
| sections = [(txt, sec_ids[i], poss) | |||
| for i, (txt, _, poss) in enumerate(sections)] | |||
| for (img, rows), poss in tbls: | |||
| if not rows: continue | |||
| sections.append((rows if isinstance(rows, str) else rows[0], -1, | |||
| [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) | |||
| def tag(pn, left, right, top, bottom): | |||
| if pn + left + right + top + bottom == 0: | |||
| return "" | |||
| return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \ | |||
| .format(pn, left, right, top, bottom) | |||
| chunks = [] | |||
| last_sid = -2 | |||
| tk_cnt = 0 | |||
| for txt, sec_id, poss in sorted(sections, key=lambda x: ( | |||
| x[-1][0][0], x[-1][0][3], x[-1][0][1])): | |||
| poss = "\t".join([tag(*pos) for pos in poss]) | |||
| if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)): | |||
| if chunks: | |||
| chunks[-1] += "\n" + txt + poss | |||
| tk_cnt += num_tokens_from_string(txt) | |||
| continue | |||
| chunks.append(txt + poss) | |||
| tk_cnt = num_tokens_from_string(txt) | |||
| if sec_id > -1: | |||
| last_sid = sec_id | |||
| res = tokenize_table(tbls, doc, eng) | |||
| res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) | |||
| return res | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| docx_parser = Docx() | |||
| ti_list, tbls = docx_parser(filename, binary, | |||
| from_page=0, to_page=10000, callback=callback) | |||
| res = tokenize_table(tbls, doc, eng) | |||
| for text, image in ti_list: | |||
| d = copy.deepcopy(doc) | |||
| d['image'] = image | |||
| tokenize(d, text, eng) | |||
| res.append(d) | |||
| return res | |||
| else: | |||
| raise NotImplementedError("file type not supported yet(pdf and docx supported)") | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], callback=dummy) | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import copy | |||
| import re | |||
| from api.db import ParserType | |||
| from io import BytesIO | |||
| from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, docx_question_level | |||
| from deepdoc.parser import PdfParser, PlainParser | |||
| from rag.utils import num_tokens_from_string | |||
| from deepdoc.parser import PdfParser, ExcelParser, DocxParser | |||
| from docx import Document | |||
| from PIL import Image | |||
| class Pdf(PdfParser): | |||
| def __init__(self): | |||
| self.model_speciess = ParserType.MANUAL.value | |||
| super().__init__() | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| from timeit import default_timer as timer | |||
| start = timer() | |||
| callback(msg="OCR is running...") | |||
| self.__images__( | |||
| filename if not binary else binary, | |||
| zoomin, | |||
| from_page, | |||
| to_page, | |||
| callback | |||
| ) | |||
| callback(msg="OCR finished.") | |||
| # for bb in self.boxes: | |||
| # for b in bb: | |||
| # print(b) | |||
| print("OCR:", timer() - start) | |||
| self._layouts_rec(zoomin) | |||
| callback(0.65, "Layout analysis finished.") | |||
| print("layouts:", timer() - start) | |||
| self._table_transformer_job(zoomin) | |||
| callback(0.67, "Table analysis finished.") | |||
| self._text_merge() | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| self._concat_downward() | |||
| self._filter_forpages() | |||
| callback(0.68, "Text merging finished") | |||
| # clean mess | |||
| for b in self.boxes: | |||
| b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip()) | |||
| return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin)) | |||
| for i, b in enumerate(self.boxes)], tbls | |||
| class Docx(DocxParser): | |||
| def __init__(self): | |||
| pass | |||
| def get_picture(self, document, paragraph): | |||
| img = paragraph._element.xpath('.//pic:pic') | |||
| if not img: | |||
| return None | |||
| img = img[0] | |||
| embed = img.xpath('.//a:blip/@r:embed')[0] | |||
| related_part = document.part.related_parts[embed] | |||
| image = related_part.image | |||
| image = Image.open(BytesIO(image.blob)) | |||
| return image | |||
| def concat_img(self, img1, img2): | |||
| if img1 and not img2: | |||
| return img1 | |||
| if not img1 and img2: | |||
| return img2 | |||
| if not img1 and not img2: | |||
| return None | |||
| width1, height1 = img1.size | |||
| width2, height2 = img2.size | |||
| new_width = max(width1, width2) | |||
| new_height = height1 + height2 | |||
| new_image = Image.new('RGB', (new_width, new_height)) | |||
| new_image.paste(img1, (0, 0)) | |||
| new_image.paste(img2, (0, height1)) | |||
| return new_image | |||
| def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None): | |||
| self.doc = Document( | |||
| filename) if not binary else Document(BytesIO(binary)) | |||
| pn = 0 | |||
| last_answer, last_image = "", None | |||
| question_stack, level_stack = [], [] | |||
| ti_list = [] | |||
| for p in self.doc.paragraphs: | |||
| if pn > to_page: | |||
| break | |||
| question_level, p_text = 0, '' | |||
| if from_page <= pn < to_page and p.text.strip(): | |||
| question_level, p_text = docx_question_level(p) | |||
| if not question_level or question_level > 6: # not a question | |||
| last_answer = f'{last_answer}\n{p_text}' | |||
| current_image = self.get_picture(self.doc, p) | |||
| last_image = self.concat_img(last_image, current_image) | |||
| else: # is a question | |||
| if last_answer or last_image: | |||
| sum_question = '\n'.join(question_stack) | |||
| if sum_question: | |||
| ti_list.append((f'{sum_question}\n{last_answer}', last_image)) | |||
| last_answer, last_image = '', None | |||
| i = question_level | |||
| while question_stack and i <= level_stack[-1]: | |||
| question_stack.pop() | |||
| level_stack.pop() | |||
| question_stack.append(p_text) | |||
| level_stack.append(question_level) | |||
| for run in p.runs: | |||
| if 'lastRenderedPageBreak' in run._element.xml: | |||
| pn += 1 | |||
| continue | |||
| if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: | |||
| pn += 1 | |||
| if last_answer: | |||
| sum_question = '\n'.join(question_stack) | |||
| if sum_question: | |||
| ti_list.append((f'{sum_question}\n{last_answer}', last_image)) | |||
| tbls = [] | |||
| for tb in self.doc.tables: | |||
| html= "<table>" | |||
| for r in tb.rows: | |||
| html += "<tr>" | |||
| i = 0 | |||
| while i < len(r.cells): | |||
| span = 1 | |||
| c = r.cells[i] | |||
| for j in range(i+1, len(r.cells)): | |||
| if c.text == r.cells[j].text: | |||
| span += 1 | |||
| i = j | |||
| i += 1 | |||
| html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>" | |||
| html += "</tr>" | |||
| html += "</table>" | |||
| tbls.append(((None, html), "")) | |||
| return ti_list, tbls | |||
| def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Only pdf and docx are supported. | |||
| """ | |||
| pdf_parser = None | |||
| doc = { | |||
| "docnm_kwd": filename | |||
| } | |||
| doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"])) | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| # is it English | |||
| eng = lang.lower() == "english" # pdf_parser.is_english | |||
| if re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| pdf_parser = Pdf() if kwargs.get( | |||
| "parser_config", {}).get( | |||
| "layout_recognize", True) else PlainParser() | |||
| sections, tbls = pdf_parser(filename if not binary else binary, | |||
| from_page=from_page, to_page=to_page, callback=callback) | |||
| if sections and len(sections[0]) < 3: | |||
| sections = [(t, l, [[0] * 5]) for t, l in sections] | |||
| # set pivot using the most frequent type of title, | |||
| # then merge between 2 pivot | |||
| if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1: | |||
| max_lvl = max([lvl for _, lvl in pdf_parser.outlines]) | |||
| most_level = max(0, max_lvl - 1) | |||
| levels = [] | |||
| for txt, _, _ in sections: | |||
| for t, lvl in pdf_parser.outlines: | |||
| tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)]) | |||
| tks_ = set([txt[i] + txt[i + 1] | |||
| for i in range(min(len(t), len(txt) - 1))]) | |||
| if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8: | |||
| levels.append(lvl) | |||
| break | |||
| else: | |||
| levels.append(max_lvl + 1) | |||
| else: | |||
| bull = bullets_category([txt for txt, _, _ in sections]) | |||
| most_level, levels = title_frequency( | |||
| bull, [(txt, l) for txt, l, poss in sections]) | |||
| assert len(sections) == len(levels) | |||
| sec_ids = [] | |||
| sid = 0 | |||
| for i, lvl in enumerate(levels): | |||
| if lvl <= most_level and i > 0 and lvl != levels[i - 1]: | |||
| sid += 1 | |||
| sec_ids.append(sid) | |||
| # print(lvl, self.boxes[i]["text"], most_level, sid) | |||
| sections = [(txt, sec_ids[i], poss) | |||
| for i, (txt, _, poss) in enumerate(sections)] | |||
| for (img, rows), poss in tbls: | |||
| if not rows: continue | |||
| sections.append((rows if isinstance(rows, str) else rows[0], -1, | |||
| [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) | |||
| def tag(pn, left, right, top, bottom): | |||
| if pn + left + right + top + bottom == 0: | |||
| return "" | |||
| return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \ | |||
| .format(pn, left, right, top, bottom) | |||
| chunks = [] | |||
| last_sid = -2 | |||
| tk_cnt = 0 | |||
| for txt, sec_id, poss in sorted(sections, key=lambda x: ( | |||
| x[-1][0][0], x[-1][0][3], x[-1][0][1])): | |||
| poss = "\t".join([tag(*pos) for pos in poss]) | |||
| if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)): | |||
| if chunks: | |||
| chunks[-1] += "\n" + txt + poss | |||
| tk_cnt += num_tokens_from_string(txt) | |||
| continue | |||
| chunks.append(txt + poss) | |||
| tk_cnt = num_tokens_from_string(txt) | |||
| if sec_id > -1: | |||
| last_sid = sec_id | |||
| res = tokenize_table(tbls, doc, eng) | |||
| res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) | |||
| return res | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| docx_parser = Docx() | |||
| ti_list, tbls = docx_parser(filename, binary, | |||
| from_page=0, to_page=10000, callback=callback) | |||
| res = tokenize_table(tbls, doc, eng) | |||
| for text, image in ti_list: | |||
| d = copy.deepcopy(doc) | |||
| d['image'] = image | |||
| tokenize(d, text, eng) | |||
| res.append(d) | |||
| return res | |||
| else: | |||
| raise NotImplementedError("file type not supported yet(pdf and docx supported)") | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], callback=dummy) | |||
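| # The tag() helper above encodes a box position as "@@page<TAB>left<TAB>right<TAB>top<TAB>bottom##" | |||
| # and appends it to the chunk text.  A minimal sketch of reading such tags back out | |||
| # (the regex and the helper name extract_positions are illustrative, not part of the codebase): | |||
| import re | |||
| POS_RE = re.compile(r"@@(\d+)\t([\d.]+)\t([\d.]+)\t([\d.]+)\t([\d.]+)##") | |||
| def extract_positions(chunk_text): | |||
| """Return (clean_text, [(page, left, right, top, bottom), ...]).""" | |||
| poss = [(int(p), float(l), float(r), float(t), float(b)) | |||
| for p, l, r, t, b in POS_RE.findall(chunk_text)] | |||
| return POS_RE.sub("", chunk_text), poss | |||
| # extract_positions("Torque limits@@3\t10.0\t90.0\t120.0\t135.0##") | |||
| # -> ("Torque limits", [(3, 10.0, 90.0, 120.0, 135.0)]) | |||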
| @@ -1,282 +1,282 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from tika import parser | |||
| from io import BytesIO | |||
| from docx import Document | |||
| from timeit import default_timer as timer | |||
| import re | |||
| from deepdoc.parser.pdf_parser import PlainParser | |||
| from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx | |||
| from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser | |||
| from rag.settings import cron_logger | |||
| from rag.utils import num_tokens_from_string | |||
| from PIL import Image | |||
| from functools import reduce | |||
| from markdown import markdown | |||
| from docx.image.exceptions import UnrecognizedImageError | |||
| class Docx(DocxParser): | |||
| def __init__(self): | |||
| pass | |||
| def get_picture(self, document, paragraph): | |||
| img = paragraph._element.xpath('.//pic:pic') | |||
| if not img: | |||
| return None | |||
| img = img[0] | |||
| embed = img.xpath('.//a:blip/@r:embed')[0] | |||
| related_part = document.part.related_parts[embed] | |||
| try: | |||
| image_blob = related_part.image.blob | |||
| except UnrecognizedImageError: | |||
| print("Unrecognized image format. Skipping image.") | |||
| return None | |||
| try: | |||
| image = Image.open(BytesIO(image_blob)).convert('RGB') | |||
| return image | |||
| except Exception as e: | |||
| return None | |||
| def __clean(self, line): | |||
| line = re.sub(r"\u3000", " ", line).strip() | |||
| return line | |||
| def __call__(self, filename, binary=None, from_page=0, to_page=100000): | |||
| self.doc = Document( | |||
| filename) if not binary else Document(BytesIO(binary)) | |||
| pn = 0 | |||
| lines = [] | |||
| last_image = None | |||
| for p in self.doc.paragraphs: | |||
| if pn > to_page: | |||
| break | |||
| if from_page <= pn < to_page: | |||
| if p.text.strip(): | |||
| if p.style and p.style.name == 'Caption': | |||
| former_image = None | |||
| if lines and lines[-1][1] and lines[-1][2] != 'Caption': | |||
| former_image = lines[-1][1].pop() | |||
| elif last_image: | |||
| former_image = last_image | |||
| last_image = None | |||
| lines.append((self.__clean(p.text), [former_image], p.style.name)) | |||
| else: | |||
| current_image = self.get_picture(self.doc, p) | |||
| image_list = [current_image] | |||
| if last_image: | |||
| image_list.insert(0, last_image) | |||
| last_image = None | |||
| lines.append((self.__clean(p.text), image_list, p.style.name)) | |||
| else: | |||
| if current_image := self.get_picture(self.doc, p): | |||
| if lines: | |||
| lines[-1][1].append(current_image) | |||
| else: | |||
| last_image = current_image | |||
| for run in p.runs: | |||
| if 'lastRenderedPageBreak' in run._element.xml: | |||
| pn += 1 | |||
| continue | |||
| if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: | |||
| pn += 1 | |||
| new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines] | |||
| tbls = [] | |||
| for tb in self.doc.tables: | |||
| html= "<table>" | |||
| for r in tb.rows: | |||
| html += "<tr>" | |||
| i = 0 | |||
| while i < len(r.cells): | |||
| span = 1 | |||
| c = r.cells[i] | |||
| for j in range(i+1, len(r.cells)): | |||
| if c.text == r.cells[j].text: | |||
| span += 1 | |||
| i = j | |||
| i += 1 | |||
| html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>" | |||
| html += "</tr>" | |||
| html += "</table>" | |||
| tbls.append(((None, html), "")) | |||
| return new_line, tbls | |||
| class Pdf(PdfParser): | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| start = timer() | |||
| callback(msg="OCR is running...") | |||
| self.__images__( | |||
| filename if not binary else binary, | |||
| zoomin, | |||
| from_page, | |||
| to_page, | |||
| callback | |||
| ) | |||
| callback(msg="OCR finished") | |||
| cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) | |||
| start = timer() | |||
| self._layouts_rec(zoomin) | |||
| callback(0.63, "Layout analysis finished.") | |||
| self._table_transformer_job(zoomin) | |||
| callback(0.65, "Table analysis finished.") | |||
| self._text_merge() | |||
| callback(0.67, "Text merging finished") | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| #self._naive_vertical_merge() | |||
| self._concat_downward() | |||
| #self._filter_forpages() | |||
| cron_logger.info("layouts: {}".format(timer() - start)) | |||
| return [(b["text"], self._line_tag(b, zoomin)) | |||
| for b in self.boxes], tbls | |||
| class Markdown(MarkdownParser): | |||
| def __call__(self, filename, binary=None): | |||
| txt = "" | |||
| tbls = [] | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| txt = f.read() | |||
| remainder, tables = self.extract_tables_and_remainder(f'{txt}\n') | |||
| sections = [] | |||
| tbls = [] | |||
| for sec in remainder.split("\n"): | |||
| if num_tokens_from_string(sec) > 10 * self.chunk_token_num: | |||
| sections.append((sec[:int(len(sec)/2)], "")) | |||
| sections.append((sec[int(len(sec)/2):], "")) | |||
| else: | |||
| sections.append((sec, "")) | |||
| print(tables) | |||
| for table in tables: | |||
| tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), "")) | |||
| return sections, tbls | |||
| def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Supported file formats are docx, pdf, excel, txt, markdown, html, json, doc and plain source code files. | |||
| This method applies the naive way to chunk files. | |||
| Successive text is sliced into pieces using 'delimiter'. | |||
| Next, these pieces are merged into chunks whose token count is no more than 'Max token number'. | |||
| """ | |||
| eng = lang.lower() == "english" # is_english(cks) | |||
| parser_config = kwargs.get( | |||
| "parser_config", { | |||
| "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True}) | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| res = [] | |||
| pdf_parser = None | |||
| sections = [] | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections, tbls = Docx()(filename, binary) | |||
| res = tokenize_table(tbls, doc, eng) # just for table | |||
| callback(0.8, "Finish parsing.") | |||
| st = timer() | |||
| chunks, images = naive_merge_docx( | |||
| sections, int(parser_config.get( | |||
| "chunk_token_num", 128)), parser_config.get( | |||
| "delimiter", "\n!?。;!?")) | |||
| if kwargs.get("section_only", False): | |||
| return chunks | |||
| res.extend(tokenize_chunks_docx(chunks, doc, eng, images)) | |||
| cron_logger.info("naive_merge({}): {}".format(filename, timer() - st)) | |||
| return res | |||
| elif re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| pdf_parser = Pdf( | |||
| ) if parser_config.get("layout_recognize", True) else PlainParser() | |||
| sections, tbls = pdf_parser(filename if not binary else binary, | |||
| from_page=from_page, to_page=to_page, callback=callback) | |||
| res = tokenize_table(tbls, doc, eng) | |||
| elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| excel_parser = ExcelParser() | |||
| sections = [(l, "") for l in excel_parser.html(binary) if l] | |||
| elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = TxtParser()(filename,binary, | |||
| parser_config.get("chunk_token_num", 128), | |||
| parser_config.get("delimiter", "\n!?;。;!?")) | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections, tbls = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary) | |||
| res = tokenize_table(tbls, doc, eng) | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = HtmlParser()(filename, binary) | |||
| sections = [(l, "") for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.json$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = JsonParser(int(parser_config.get("chunk_token_num", 128)))(binary) | |||
| sections = [(l, "") for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.doc$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| binary = BytesIO(binary) | |||
| doc_parsed = parser.from_buffer(binary) | |||
| sections = doc_parsed['content'].split('\n') | |||
| sections = [(l, "") for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| else: | |||
| raise NotImplementedError( | |||
| "file type not supported yet(pdf, xlsx, doc, docx, txt supported)") | |||
| st = timer() | |||
| chunks = naive_merge( | |||
| sections, int(parser_config.get( | |||
| "chunk_token_num", 128)), parser_config.get( | |||
| "delimiter", "\n!?。;!?")) | |||
| if kwargs.get("section_only", False): | |||
| return chunks | |||
| res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) | |||
| cron_logger.info("naive_merge({}): {}".format(filename, timer() - st)) | |||
| return res | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from tika import parser | |||
| from io import BytesIO | |||
| from docx import Document | |||
| from timeit import default_timer as timer | |||
| import re | |||
| from deepdoc.parser.pdf_parser import PlainParser | |||
| from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx | |||
| from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser | |||
| from rag.settings import cron_logger | |||
| from rag.utils import num_tokens_from_string | |||
| from PIL import Image | |||
| from functools import reduce | |||
| from markdown import markdown | |||
| from docx.image.exceptions import UnrecognizedImageError | |||
| class Docx(DocxParser): | |||
| def __init__(self): | |||
| pass | |||
| def get_picture(self, document, paragraph): | |||
| img = paragraph._element.xpath('.//pic:pic') | |||
| if not img: | |||
| return None | |||
| img = img[0] | |||
| embed = img.xpath('.//a:blip/@r:embed')[0] | |||
| related_part = document.part.related_parts[embed] | |||
| try: | |||
| image_blob = related_part.image.blob | |||
| except UnrecognizedImageError: | |||
| print("Unrecognized image format. Skipping image.") | |||
| return None | |||
| try: | |||
| image = Image.open(BytesIO(image_blob)).convert('RGB') | |||
| return image | |||
| except Exception as e: | |||
| return None | |||
| def __clean(self, line): | |||
| line = re.sub(r"\u3000", " ", line).strip() | |||
| return line | |||
| def __call__(self, filename, binary=None, from_page=0, to_page=100000): | |||
| self.doc = Document( | |||
| filename) if not binary else Document(BytesIO(binary)) | |||
| pn = 0 | |||
| lines = [] | |||
| last_image = None | |||
| for p in self.doc.paragraphs: | |||
| if pn > to_page: | |||
| break | |||
| if from_page <= pn < to_page: | |||
| if p.text.strip(): | |||
| if p.style and p.style.name == 'Caption': | |||
| former_image = None | |||
| if lines and lines[-1][1] and lines[-1][2] != 'Caption': | |||
| former_image = lines[-1][1].pop() | |||
| elif last_image: | |||
| former_image = last_image | |||
| last_image = None | |||
| lines.append((self.__clean(p.text), [former_image], p.style.name)) | |||
| else: | |||
| current_image = self.get_picture(self.doc, p) | |||
| image_list = [current_image] | |||
| if last_image: | |||
| image_list.insert(0, last_image) | |||
| last_image = None | |||
| lines.append((self.__clean(p.text), image_list, p.style.name)) | |||
| else: | |||
| if current_image := self.get_picture(self.doc, p): | |||
| if lines: | |||
| lines[-1][1].append(current_image) | |||
| else: | |||
| last_image = current_image | |||
| for run in p.runs: | |||
| if 'lastRenderedPageBreak' in run._element.xml: | |||
| pn += 1 | |||
| continue | |||
| if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: | |||
| pn += 1 | |||
| new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines] | |||
| tbls = [] | |||
| for tb in self.doc.tables: | |||
| html= "<table>" | |||
| for r in tb.rows: | |||
| html += "<tr>" | |||
| i = 0 | |||
| while i < len(r.cells): | |||
| span = 1 | |||
| c = r.cells[i] | |||
| for j in range(i+1, len(r.cells)): | |||
| if c.text == r.cells[j].text: | |||
| span += 1 | |||
| i = j | |||
| i += 1 | |||
| html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>" | |||
| html += "</tr>" | |||
| html += "</table>" | |||
| tbls.append(((None, html), "")) | |||
| return new_line, tbls | |||
| class Pdf(PdfParser): | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| start = timer() | |||
| callback(msg="OCR is running...") | |||
| self.__images__( | |||
| filename if not binary else binary, | |||
| zoomin, | |||
| from_page, | |||
| to_page, | |||
| callback | |||
| ) | |||
| callback(msg="OCR finished") | |||
| cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) | |||
| start = timer() | |||
| self._layouts_rec(zoomin) | |||
| callback(0.63, "Layout analysis finished.") | |||
| self._table_transformer_job(zoomin) | |||
| callback(0.65, "Table analysis finished.") | |||
| self._text_merge() | |||
| callback(0.67, "Text merging finished") | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| #self._naive_vertical_merge() | |||
| self._concat_downward() | |||
| #self._filter_forpages() | |||
| cron_logger.info("layouts: {}".format(timer() - start)) | |||
| return [(b["text"], self._line_tag(b, zoomin)) | |||
| for b in self.boxes], tbls | |||
| class Markdown(MarkdownParser): | |||
| def __call__(self, filename, binary=None): | |||
| txt = "" | |||
| tbls = [] | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| txt = f.read() | |||
| remainder, tables = self.extract_tables_and_remainder(f'{txt}\n') | |||
| sections = [] | |||
| tbls = [] | |||
| for sec in remainder.split("\n"): | |||
| if num_tokens_from_string(sec) > 10 * self.chunk_token_num: | |||
| sections.append((sec[:int(len(sec)/2)], "")) | |||
| sections.append((sec[int(len(sec)/2):], "")) | |||
| else: | |||
| sections.append((sec, "")) | |||
| print(tables) | |||
| for table in tables: | |||
| tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), "")) | |||
| return sections, tbls | |||
| def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Supported file formats are docx, pdf, excel, txt, markdown, html, json, doc and plain source code files. | |||
| This method applies the naive way to chunk files. | |||
| Successive text is sliced into pieces using 'delimiter'. | |||
| Next, these pieces are merged into chunks whose token count is no more than 'Max token number'. | |||
| """ | |||
| eng = lang.lower() == "english" # is_english(cks) | |||
| parser_config = kwargs.get( | |||
| "parser_config", { | |||
| "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True}) | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| res = [] | |||
| pdf_parser = None | |||
| sections = [] | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections, tbls = Docx()(filename, binary) | |||
| res = tokenize_table(tbls, doc, eng) # just for table | |||
| callback(0.8, "Finish parsing.") | |||
| st = timer() | |||
| chunks, images = naive_merge_docx( | |||
| sections, int(parser_config.get( | |||
| "chunk_token_num", 128)), parser_config.get( | |||
| "delimiter", "\n!?。;!?")) | |||
| if kwargs.get("section_only", False): | |||
| return chunks | |||
| res.extend(tokenize_chunks_docx(chunks, doc, eng, images)) | |||
| cron_logger.info("naive_merge({}): {}".format(filename, timer() - st)) | |||
| return res | |||
| elif re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| pdf_parser = Pdf( | |||
| ) if parser_config.get("layout_recognize", True) else PlainParser() | |||
| sections, tbls = pdf_parser(filename if not binary else binary, | |||
| from_page=from_page, to_page=to_page, callback=callback) | |||
| res = tokenize_table(tbls, doc, eng) | |||
| elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| excel_parser = ExcelParser() | |||
| sections = [(l, "") for l in excel_parser.html(binary) if l] | |||
| elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = TxtParser()(filename,binary, | |||
| parser_config.get("chunk_token_num", 128), | |||
| parser_config.get("delimiter", "\n!?;。;!?")) | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections, tbls = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary) | |||
| res = tokenize_table(tbls, doc, eng) | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = HtmlParser()(filename, binary) | |||
| sections = [(l, "") for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.json$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = JsonParser(int(parser_config.get("chunk_token_num", 128)))(binary) | |||
| sections = [(l, "") for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.doc$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| binary = BytesIO(binary) | |||
| doc_parsed = parser.from_buffer(binary) | |||
| sections = doc_parsed['content'].split('\n') | |||
| sections = [(l, "") for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| else: | |||
| raise NotImplementedError( | |||
| "file type not supported yet(pdf, xlsx, doc, docx, txt supported)") | |||
| st = timer() | |||
| chunks = naive_merge( | |||
| sections, int(parser_config.get( | |||
| "chunk_token_num", 128)), parser_config.get( | |||
| "delimiter", "\n!?。;!?")) | |||
| if kwargs.get("section_only", False): | |||
| return chunks | |||
| res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) | |||
| cron_logger.info("naive_merge({}): {}".format(filename, timer() - st)) | |||
| return res | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) | |||
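| # Simplified sketch of the packing step described in the chunk() docstring above: split on | |||
| # the configured delimiters, then greedily merge successive pieces until the token budget | |||
| # (parser_config "chunk_token_num", default 128) would be exceeded.  This illustrates the | |||
| # idea only and is not the actual rag.nlp.naive_merge implementation; token counting is | |||
| # approximated with a whitespace split instead of num_tokens_from_string(). | |||
| import re | |||
| def naive_merge_sketch(text, chunk_token_num=128, delimiter="\n!?。;!?"): | |||
| pieces = [p for p in re.split("([%s])" % re.escape(delimiter), text) if p.strip()] | |||
| chunks, budget = [""], 0 | |||
| for piece in pieces: | |||
| cnt = len(piece.split())  # crude stand-in for num_tokens_from_string | |||
| if budget + cnt > chunk_token_num and chunks[-1]: | |||
| chunks.append("") | |||
| budget = 0 | |||
| chunks[-1] += piece | |||
| budget += cnt | |||
| return [c for c in chunks if c.strip()] | |||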
| @@ -1,133 +1,133 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from tika import parser | |||
| from io import BytesIO | |||
| import re | |||
| from rag.app import laws | |||
| from rag.nlp import rag_tokenizer, tokenize, find_codec | |||
| from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser | |||
| class Pdf(PdfParser): | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| callback(msg="OCR is running...") | |||
| self.__images__( | |||
| filename if not binary else binary, | |||
| zoomin, | |||
| from_page, | |||
| to_page, | |||
| callback | |||
| ) | |||
| callback(msg="OCR finished") | |||
| from timeit import default_timer as timer | |||
| start = timer() | |||
| self._layouts_rec(zoomin, drop=False) | |||
| callback(0.63, "Layout analysis finished.") | |||
| print("layouts:", timer() - start) | |||
| self._table_transformer_job(zoomin) | |||
| callback(0.65, "Table analysis finished.") | |||
| self._text_merge() | |||
| callback(0.67, "Text merging finished") | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| self._concat_downward() | |||
| sections = [(b["text"], self.get_position(b, zoomin)) | |||
| for i, b in enumerate(self.boxes)] | |||
| for (img, rows), poss in tbls: | |||
| if not rows: continue | |||
| sections.append((rows if isinstance(rows, str) else rows[0], | |||
| [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) | |||
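| # sort text and table snippets by (page number, top y, left x) so the resulting chunk keeps natural reading order | |||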
| return [(txt, "") for txt, _ in sorted(sections, key=lambda x: ( | |||
| x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None | |||
| def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Supported file formats are docx, pdf, excel(xlsx), txt, htm/html and doc. | |||
| The whole file forms one chunk, which keeps the original text order. | |||
| """ | |||
| eng = lang.lower() == "english" # is_english(cks) | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = [txt for txt in laws.Docx()(filename, binary) if txt] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| pdf_parser = Pdf() if kwargs.get( | |||
| "parser_config", {}).get( | |||
| "layout_recognize", True) else PlainParser() | |||
| sections, _ = pdf_parser( | |||
| filename if not binary else binary, to_page=to_page, callback=callback) | |||
| sections = [s for s, _ in sections if s] | |||
| elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| excel_parser = ExcelParser() | |||
| sections = excel_parser.html(binary, 1000000000) | |||
| elif re.search(r"\.txt$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| txt = "" | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| while True: | |||
| l = f.readline() | |||
| if not l: | |||
| break | |||
| txt += l | |||
| sections = txt.split("\n") | |||
| sections = [s for s in sections if s] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = HtmlParser()(filename, binary) | |||
| sections = [s for s in sections if s] | |||
| callback(0.8, "Finish parsing.") | |||
| elif re.search(r"\.doc$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| binary = BytesIO(binary) | |||
| doc_parsed = parser.from_buffer(binary) | |||
| sections = doc_parsed['content'].split('\n') | |||
| sections = [l for l in sections if l] | |||
| callback(0.8, "Finish parsing.") | |||
| else: | |||
| raise NotImplementedError( | |||
| "file type not supported yet(doc, docx, pdf, txt supported)") | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| tokenize(doc, "\n".join(sections), eng) | |||
| return [doc] | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) | |||
| @@ -1,287 +1,287 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import copy | |||
| import re | |||
| from collections import Counter | |||
| from api.db import ParserType | |||
| from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks | |||
| from deepdoc.parser import PdfParser, PlainParser | |||
| import numpy as np | |||
| from rag.utils import num_tokens_from_string | |||
| class Pdf(PdfParser): | |||
| def __init__(self): | |||
| self.model_speciess = ParserType.PAPER.value | |||
| super().__init__() | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| callback(msg="OCR is running...") | |||
| self.__images__( | |||
| filename if not binary else binary, | |||
| zoomin, | |||
| from_page, | |||
| to_page, | |||
| callback | |||
| ) | |||
| callback(msg="OCR finished.") | |||
| from timeit import default_timer as timer | |||
| start = timer() | |||
| self._layouts_rec(zoomin) | |||
| callback(0.63, "Layout analysis finished") | |||
| print("layouts:", timer() - start) | |||
| self._table_transformer_job(zoomin) | |||
| callback(0.68, "Table analysis finished") | |||
| self._text_merge() | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| column_width = np.median([b["x1"] - b["x0"] for b in self.boxes]) | |||
| self._concat_downward() | |||
| self._filter_forpages() | |||
| callback(0.75, "Text merging finished.") | |||
| # clean mess | |||
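| # column heuristic: if the median box width is below half of the page width, assume a two-column layout and re-sort boxes by column | |||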
| if column_width < self.page_images[0].size[0] / zoomin / 2: | |||
| print("two_column...................", column_width, | |||
| self.page_images[0].size[0] / zoomin / 2) | |||
| self.boxes = self.sort_X_by_page(self.boxes, column_width / 2) | |||
| for b in self.boxes: | |||
| b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip()) | |||
| def _begin(txt): | |||
| return re.match( | |||
| "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)", | |||
| txt.lower().strip()) | |||
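| # when parsing resumes from a later page, skip title/author/abstract detection and return the text/title sections directly | |||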
| if from_page > 0: | |||
| return { | |||
| "title": "", | |||
| "authors": "", | |||
| "abstract": "", | |||
| "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if | |||
| re.match(r"(text|title)", b.get("layoutno", "text"))], | |||
| "tables": tbls | |||
| } | |||
| # get title and authors | |||
| title = "" | |||
| authors = [] | |||
| i = 0 | |||
| while i < min(32, len(self.boxes)-1): | |||
| b = self.boxes[i] | |||
| i += 1 | |||
| if b.get("layoutno", "").find("title") >= 0: | |||
| title = b["text"] | |||
| if _begin(title): | |||
| title = "" | |||
| break | |||
| for j in range(3): | |||
| if _begin(self.boxes[i + j]["text"]): | |||
| break | |||
| authors.append(self.boxes[i + j]["text"]) | |||
| break | |||
| break | |||
| # get abstract | |||
| abstr = "" | |||
| i = 0 | |||
| while i + 1 < min(32, len(self.boxes)): | |||
| b = self.boxes[i] | |||
| i += 1 | |||
| txt = b["text"].lower().strip() | |||
| if re.match("(abstract|摘要)", txt): | |||
| if len(txt.split(" ")) > 32 or len(txt) > 64: | |||
| abstr = txt + self._line_tag(b, zoomin) | |||
| break | |||
| txt = self.boxes[i]["text"].lower().strip() | |||
| if len(txt.split(" ")) > 32 or len(txt) > 64: | |||
| abstr = txt + self._line_tag(self.boxes[i], zoomin) | |||
| i += 1 | |||
| break | |||
| if not abstr: | |||
| i = 0 | |||
| callback( | |||
| 0.8, "Page {}~{}: Text merging finished".format( | |||
| from_page, min( | |||
| to_page, self.total_page))) | |||
| for b in self.boxes: | |||
| print(b["text"], b.get("layoutno")) | |||
| print(tbls) | |||
| return { | |||
| "title": title, | |||
| "authors": " ".join(authors), | |||
| "abstract": abstr, | |||
| "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if | |||
| re.match(r"(text|title)", b.get("layoutno", "text"))], | |||
| "tables": tbls | |||
| } | |||
| def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Only PDF is supported. | |||
| The abstract of the paper is kept as a single, whole chunk and is never split into parts. | |||
| """ | |||
| pdf_parser = None | |||
| if re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| if not kwargs.get("parser_config", {}).get("layout_recognize", True): | |||
| pdf_parser = PlainParser() | |||
| paper = { | |||
| "title": filename, | |||
| "authors": " ", | |||
| "abstract": "", | |||
| "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0], | |||
| "tables": [] | |||
| } | |||
| else: | |||
| pdf_parser = Pdf() | |||
| paper = pdf_parser(filename if not binary else binary, | |||
| from_page=from_page, to_page=to_page, callback=callback) | |||
| else: | |||
| raise NotImplementedError("file type not supported yet(pdf supported)") | |||
| doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]), | |||
| "title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)} | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"]) | |||
| # is it English | |||
| eng = lang.lower() == "english" # pdf_parser.is_english | |||
| print("It's English.....", eng) | |||
| res = tokenize_table(paper["tables"], doc, eng) | |||
| if paper["abstract"]: | |||
| d = copy.deepcopy(doc) | |||
| txt = pdf_parser.remove_tag(paper["abstract"]) | |||
| d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"] | |||
| d["important_tks"] = " ".join(d["important_kwd"]) | |||
| d["image"], poss = pdf_parser.crop( | |||
| paper["abstract"], need_position=True) | |||
| add_positions(d, poss) | |||
| tokenize(d, txt, eng) | |||
| res.append(d) | |||
| sorted_sections = paper["sections"] | |||
| # set pivot using the most frequent type of title, | |||
| # then merge between 2 pivot | |||
| bull = bullets_category([txt for txt, _ in sorted_sections]) | |||
| most_level, levels = title_frequency(bull, sorted_sections) | |||
| assert len(sorted_sections) == len(levels) | |||
| sec_ids = [] | |||
| sid = 0 | |||
| for i, lvl in enumerate(levels): | |||
| if lvl <= most_level and i > 0 and lvl != levels[i - 1]: | |||
| sid += 1 | |||
| sec_ids.append(sid) | |||
| print(lvl, sorted_sections[i][0], most_level, sid) | |||
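| # sections that share a sec_id sit between the same pair of pivot titles and are merged into one chunk below | |||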
| chunks = [] | |||
| last_sid = -2 | |||
| for (txt, _), sec_id in zip(sorted_sections, sec_ids): | |||
| if sec_id == last_sid: | |||
| if chunks: | |||
| chunks[-1] += "\n" + txt | |||
| continue | |||
| chunks.append(txt) | |||
| last_sid = sec_id | |||
| res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) | |||
| return res | |||
| """ | |||
| readed = [0] * len(paper["lines"]) | |||
| # find colon firstly | |||
| i = 0 | |||
| while i + 1 < len(paper["lines"]): | |||
| txt = pdf_parser.remove_tag(paper["lines"][i][0]) | |||
| j = i | |||
| if txt.strip("\n").strip()[-1] not in "::": | |||
| i += 1 | |||
| continue | |||
| i += 1 | |||
| while i < len(paper["lines"]) and not paper["lines"][i][0]: | |||
| i += 1 | |||
| if i >= len(paper["lines"]): break | |||
| proj = [paper["lines"][i][0].strip()] | |||
| i += 1 | |||
| while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]: | |||
| proj.append(paper["lines"][i]) | |||
| i += 1 | |||
| for k in range(j, i): readed[k] = True | |||
| txt = txt[::-1] | |||
| if eng: | |||
| r = re.search(r"(.*?) ([\\.;?!]|$)", txt) | |||
| txt = r.group(1)[::-1] if r else txt[::-1] | |||
| else: | |||
| r = re.search(r"(.*?) ([。?;!]|$)", txt) | |||
| txt = r.group(1)[::-1] if r else txt[::-1] | |||
| for p in proj: | |||
| d = copy.deepcopy(doc) | |||
| txt += "\n" + pdf_parser.remove_tag(p) | |||
| d["image"], poss = pdf_parser.crop(p, need_position=True) | |||
| add_positions(d, poss) | |||
| tokenize(d, txt, eng) | |||
| res.append(d) | |||
| i = 0 | |||
| chunk = [] | |||
| tk_cnt = 0 | |||
| def add_chunk(): | |||
| nonlocal chunk, res, doc, pdf_parser, tk_cnt | |||
| d = copy.deepcopy(doc) | |||
| ck = "\n".join(chunk) | |||
| tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english) | |||
| d["image"], poss = pdf_parser.crop(ck, need_position=True) | |||
| add_positions(d, poss) | |||
| res.append(d) | |||
| chunk = [] | |||
| tk_cnt = 0 | |||
| while i < len(paper["lines"]): | |||
| if tk_cnt > 128: | |||
| add_chunk() | |||
| if readed[i]: | |||
| i += 1 | |||
| continue | |||
| readed[i] = True | |||
| txt, layouts = paper["lines"][i] | |||
| txt_ = pdf_parser.remove_tag(txt) | |||
| i += 1 | |||
| cnt = num_tokens_from_string(txt_) | |||
| if any([ | |||
| layouts.find("title") >= 0 and chunk, | |||
| cnt + tk_cnt > 128 and tk_cnt > 32, | |||
| ]): | |||
| add_chunk() | |||
| chunk = [txt] | |||
| tk_cnt = cnt | |||
| else: | |||
| chunk.append(txt) | |||
| tk_cnt += cnt | |||
| if chunk: add_chunk() | |||
| for i, d in enumerate(res): | |||
| print(d) | |||
| # d["image"].save(f"./logs/{i}.jpg") | |||
| return res | |||
| """ | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], callback=dummy) | |||
| @@ -1,52 +1,52 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import io | |||
| import numpy as np | |||
| from PIL import Image | |||
| from api.db import LLMType | |||
| from api.db.services.llm_service import LLMBundle | |||
| from rag.nlp import tokenize | |||
| from deepdoc.vision import OCR | |||
| ocr = OCR() | |||
| def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs): | |||
| img = Image.open(io.BytesIO(binary)).convert('RGB') | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "image": img | |||
| } | |||
| bxs = ocr(np.array(img)) | |||
| txt = "\n".join([t[0] for _, t in bxs if t[0]]) | |||
| eng = lang.lower() == "english" | |||
| callback(0.4, "Finish OCR: (%s ...)" % txt[:12]) | |||
| if (eng and len(txt.split(" ")) > 32) or len(txt) > 32: | |||
| tokenize(doc, txt, eng) | |||
| callback(0.8, "OCR results is too long to use CV LLM.") | |||
| return [doc] | |||
| try: | |||
| callback(0.4, "Use CV LLM to describe the picture.") | |||
| cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang) | |||
| ans = cv_mdl.describe(binary) | |||
| callback(0.8, "CV LLM respond: %s ..." % ans[:32]) | |||
| txt += "\n" + ans | |||
| tokenize(doc, txt, eng) | |||
| return [doc] | |||
| except Exception as e: | |||
| callback(prog=-1, msg=str(e)) | |||
| return [] | |||
| @@ -1,143 +1,143 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import copy | |||
| import re | |||
| from io import BytesIO | |||
| from PIL import Image | |||
| from rag.nlp import tokenize, is_english | |||
| from rag.nlp import rag_tokenizer | |||
| from deepdoc.parser import PdfParser, PptParser, PlainParser | |||
| from PyPDF2 import PdfReader as pdf2_read | |||
| class Ppt(PptParser): | |||
| def __call__(self, fnm, from_page, to_page, callback=None): | |||
| txts = super().__call__(fnm, from_page, to_page) | |||
| callback(0.5, "Text extraction finished.") | |||
| import aspose.slides as slides | |||
| import aspose.pydrawing as drawing | |||
| imgs = [] | |||
| with slides.Presentation(BytesIO(fnm)) as presentation: | |||
| for i, slide in enumerate(presentation.slides[from_page: to_page]): | |||
| buffered = BytesIO() | |||
| slide.get_thumbnail( | |||
| 0.5, 0.5).save( | |||
| buffered, drawing.imaging.ImageFormat.jpeg) | |||
| imgs.append(Image.open(buffered)) | |||
| assert len(imgs) == len( | |||
| txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts)) | |||
| callback(0.9, "Image extraction finished") | |||
| self.is_english = is_english(txts) | |||
| return [(txts[i], imgs[i]) for i in range(len(txts))] | |||
| class Pdf(PdfParser): | |||
| def __init__(self): | |||
| super().__init__() | |||
| def __garbage(self, txt): | |||
| txt = txt.lower().strip() | |||
| if re.match(r"[0-9\.,%/-]+$", txt): | |||
| return True | |||
| if len(txt) < 3: | |||
| return True | |||
| return False | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| callback(msg="OCR is running...") | |||
| self.__images__(filename if not binary else binary, | |||
| zoomin, from_page, to_page, callback) | |||
| callback(0.8, "Page {}~{}: OCR finished".format( | |||
| from_page, min(to_page, self.total_page))) | |||
| assert len(self.boxes) == len(self.page_images), "{} vs. {}".format( | |||
| len(self.boxes), len(self.page_images)) | |||
| res = [] | |||
| for i in range(len(self.boxes)): | |||
| lines = "\n".join([b["text"] for b in self.boxes[i] | |||
| if not self.__garbage(b["text"])]) | |||
| res.append((lines, self.page_images[i])) | |||
| callback(0.9, "Page {}~{}: Parsing finished".format( | |||
| from_page, min(to_page, self.total_page))) | |||
| return res | |||
| class PlainPdf(PlainParser): | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, callback=None, **kwargs): | |||
| self.pdf = pdf2_read(filename if not binary else BytesIO(binary)) | |||
| page_txt = [] | |||
| for page in self.pdf.pages[from_page: to_page]: | |||
| page_txt.append(page.extract_text()) | |||
| callback(0.9, "Parsing finished") | |||
| return [(txt, None) for txt in page_txt] | |||
| def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| The supported file formats are pdf, pptx. | |||
| Every page is treated as a chunk, and the thumbnail of every page is stored. | |||
| PPT files are parsed by this method automatically; no per-file setup is necessary. | |||
| """ | |||
| eng = lang.lower() == "english" | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| res = [] | |||
| if re.search(r"\.pptx?$", filename, re.IGNORECASE): | |||
| ppt_parser = Ppt() | |||
| for pn, (txt, img) in enumerate(ppt_parser( | |||
| filename if not binary else binary, from_page, 1000000, callback)): | |||
| d = copy.deepcopy(doc) | |||
| pn += from_page | |||
| d["image"] = img | |||
| d["page_num_int"] = [pn + 1] | |||
| d["top_int"] = [0] | |||
| d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])] | |||
| tokenize(d, txt, eng) | |||
| res.append(d) | |||
| return res | |||
| elif re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| pdf_parser = Pdf() if kwargs.get( | |||
| "parser_config", {}).get( | |||
| "layout_recognize", True) else PlainPdf() | |||
| for pn, (txt, img) in enumerate(pdf_parser(filename, binary, | |||
| from_page=from_page, to_page=to_page, callback=callback)): | |||
| d = copy.deepcopy(doc) | |||
| pn += from_page | |||
| if img: | |||
| d["image"] = img | |||
| d["page_num_int"] = [pn + 1] | |||
| d["top_int"] = [0] | |||
| d["position_int"] = [ | |||
| (pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)] | |||
| tokenize(d, txt, eng) | |||
| res.append(d) | |||
| return res | |||
| raise NotImplementedError( | |||
| "file type not supported yet(pptx, pdf supported)") | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(a, b): | |||
| pass | |||
| chunk(sys.argv[1], callback=dummy) | |||
| @@ -1,422 +1,422 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import re | |||
| from copy import deepcopy | |||
| from io import BytesIO | |||
| from timeit import default_timer as timer | |||
| from nltk import word_tokenize | |||
| from openpyxl import load_workbook | |||
| from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level | |||
| from rag.nlp import rag_tokenizer, tokenize_table, concat_img | |||
| from rag.settings import cron_logger | |||
| from deepdoc.parser import PdfParser, ExcelParser, DocxParser | |||
| from docx import Document | |||
| from PIL import Image | |||
| from markdown import markdown | |||
| class Excel(ExcelParser): | |||
| def __call__(self, fnm, binary=None, callback=None): | |||
| if not binary: | |||
| wb = load_workbook(fnm) | |||
| else: | |||
| wb = load_workbook(BytesIO(binary)) | |||
| total = 0 | |||
| for sheetname in wb.sheetnames: | |||
| total += len(list(wb[sheetname].rows)) | |||
| res, fails = [], [] | |||
| for sheetname in wb.sheetnames: | |||
| ws = wb[sheetname] | |||
| rows = list(ws.rows) | |||
| for i, r in enumerate(rows): | |||
| q, a = "", "" | |||
| for cell in r: | |||
| if not cell.value: | |||
| continue | |||
| if not q: | |||
| q = str(cell.value) | |||
| elif not a: | |||
| a = str(cell.value) | |||
| else: | |||
| break | |||
| if q and a: | |||
| res.append((q, a)) | |||
| else: | |||
| fails.append(str(i + 1)) | |||
| if len(res) % 999 == 0: | |||
| callback(len(res) * | |||
| 0.6 / | |||
| total, ("Extract Q&A: {}".format(len(res)) + | |||
| (f"{len(fails)} failure, line: %s..." % | |||
| (",".join(fails[:3])) if fails else ""))) | |||
| callback(0.6, ("Extract Q&A: {}. ".format(len(res)) + ( | |||
| f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) | |||
| self.is_english = is_english( | |||
| [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1]) | |||
| return res | |||
| class Pdf(PdfParser): | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| start = timer() | |||
| callback(msg="OCR is running...") | |||
| self.__images__( | |||
| filename if not binary else binary, | |||
| zoomin, | |||
| from_page, | |||
| to_page, | |||
| callback | |||
| ) | |||
| callback(msg="OCR finished") | |||
| cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) | |||
| start = timer() | |||
| self._layouts_rec(zoomin, drop=False) | |||
| callback(0.63, "Layout analysis finished.") | |||
| self._table_transformer_job(zoomin) | |||
| callback(0.65, "Table analysis finished.") | |||
| self._text_merge() | |||
| callback(0.67, "Text merging finished") | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| #self._naive_vertical_merge() | |||
| # self._concat_downward() | |||
| #self._filter_forpages() | |||
| cron_logger.info("layouts: {}".format(timer() - start)) | |||
| sections = [b["text"] for b in self.boxes] | |||
| bull_x0_list = [] | |||
| q_bull, reg = qbullets_category(sections) | |||
| if q_bull == -1: | |||
| raise ValueError("Unable to recognize Q&A structure.") | |||
| qai_list = [] | |||
| last_q, last_a, last_tag = '', '', '' | |||
| last_index = -1 | |||
| last_box = {'text':''} | |||
| last_bull = None | |||
| def sort_key(element): | |||
| tbls_pn = element[1][0][0] | |||
| tbls_top = element[1][0][3] | |||
| return tbls_pn, tbls_top | |||
| tbls.sort(key=sort_key) | |||
| tbl_index = 0 | |||
| last_pn, last_bottom = 0, 0 | |||
| tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', '' | |||
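| # position tags look like '@@page\tleft\tright\ttop\tbottom##'; they are parsed below to decide whether a table/figure falls before, inside or after the current answer | |||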
| for box in self.boxes: | |||
| section, line_tag = box['text'], self._line_tag(box, zoomin) | |||
| has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list) | |||
| last_box, last_index, last_bull = box, index, has_bull | |||
| line_pn = float(line_tag.lstrip('@@').split('\t')[0]) | |||
| line_top = float(line_tag.rstrip('##').split('\t')[3]) | |||
| tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index) | |||
| if not has_bull: # No question bullet | |||
| if not last_q: | |||
| if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top): # image passed | |||
| tbl_index += 1 | |||
| continue | |||
| else: | |||
| sum_tag = line_tag | |||
| sum_section = section | |||
| while ((tbl_pn == last_pn and tbl_top >= last_bottom) or (tbl_pn > last_pn)) \ | |||
| and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the middle of current answer | |||
| sum_tag = f'{tbl_tag}{sum_tag}' | |||
| sum_section = f'{tbl_text}{sum_section}' | |||
| tbl_index += 1 | |||
| tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index) | |||
| last_a = f'{last_a}{sum_section}' | |||
| last_tag = f'{last_tag}{sum_tag}' | |||
| else: | |||
| if last_q: | |||
| while ((tbl_pn == last_pn and tbl_top >= last_bottom) or (tbl_pn > last_pn)) \ | |||
| and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the end of last answer | |||
| last_tag = f'{last_tag}{tbl_tag}' | |||
| last_a = f'{last_a}{tbl_text}' | |||
| tbl_index += 1 | |||
| tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index) | |||
| image, poss = self.crop(last_tag, need_position=True) | |||
| qai_list.append((last_q, last_a, image, poss)) | |||
| last_q, last_a, last_tag = '', '', '' | |||
| last_q = has_bull.group() | |||
| _, end = has_bull.span() | |||
| last_a = section[end:] | |||
| last_tag = line_tag | |||
| last_bottom = float(line_tag.rstrip('##').split('\t')[4]) | |||
| last_pn = line_pn | |||
| if last_q: | |||
| qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True))) | |||
| return qai_list, tbls | |||
| def get_tbls_info(self, tbls, tbl_index): | |||
| if tbl_index >= len(tbls): | |||
| return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', '' | |||
| tbl_pn = tbls[tbl_index][1][0][0]+1 | |||
| tbl_left = tbls[tbl_index][1][0][1] | |||
| tbl_right = tbls[tbl_index][1][0][2] | |||
| tbl_top = tbls[tbl_index][1][0][3] | |||
| tbl_bottom = tbls[tbl_index][1][0][4] | |||
| tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \ | |||
| .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom) | |||
| tbl_text = ''.join(tbls[tbl_index][0][1]) | |||
| return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text | |||
| class Docx(DocxParser): | |||
| def __init__(self): | |||
| pass | |||
| def get_picture(self, document, paragraph): | |||
| img = paragraph._element.xpath('.//pic:pic') | |||
| if not img: | |||
| return None | |||
| img = img[0] | |||
| embed = img.xpath('.//a:blip/@r:embed')[0] | |||
| related_part = document.part.related_parts[embed] | |||
| image = related_part.image | |||
| image = Image.open(BytesIO(image.blob)).convert('RGB') | |||
| return image | |||
| def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None): | |||
| self.doc = Document( | |||
| filename) if not binary else Document(BytesIO(binary)) | |||
| pn = 0 | |||
| last_answer, last_image = "", None | |||
| question_stack, level_stack = [], [] | |||
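| # question_stack keeps the current heading path (one entry per outline level); a new question pops equal or deeper levels before being pushed | |||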
| qai_list = [] | |||
| for p in self.doc.paragraphs: | |||
| if pn > to_page: | |||
| break | |||
| question_level, p_text = 0, '' | |||
| if from_page <= pn < to_page and p.text.strip(): | |||
| question_level, p_text = docx_question_level(p) | |||
| if not question_level or question_level > 6: # not a question | |||
| last_answer = f'{last_answer}\n{p_text}' | |||
| current_image = self.get_picture(self.doc, p) | |||
| last_image = concat_img(last_image, current_image) | |||
| else: # is a question | |||
| if last_answer or last_image: | |||
| sum_question = '\n'.join(question_stack) | |||
| if sum_question: | |||
| qai_list.append((sum_question, last_answer, last_image)) | |||
| last_answer, last_image = '', None | |||
| i = question_level | |||
| while question_stack and i <= level_stack[-1]: | |||
| question_stack.pop() | |||
| level_stack.pop() | |||
| question_stack.append(p_text) | |||
| level_stack.append(question_level) | |||
| for run in p.runs: | |||
| if 'lastRenderedPageBreak' in run._element.xml: | |||
| pn += 1 | |||
| continue | |||
| if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: | |||
| pn += 1 | |||
| if last_answer: | |||
| sum_question = '\n'.join(question_stack) | |||
| if sum_question: | |||
| qai_list.append((sum_question, last_answer, last_image)) | |||
| tbls = [] | |||
| for tb in self.doc.tables: | |||
| html= "<table>" | |||
| for r in tb.rows: | |||
| html += "<tr>" | |||
| i = 0 | |||
| while i < len(r.cells): | |||
| span = 1 | |||
| c = r.cells[i] | |||
| for j in range(i+1, len(r.cells)): | |||
| if c.text == r.cells[j].text: | |||
| span += 1 | |||
| i = j | |||
| i += 1 | |||
| html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>" | |||
| html += "</tr>" | |||
| html += "</table>" | |||
| tbls.append(((None, html), "")) | |||
| return qai_list, tbls | |||
| def rmPrefix(txt): | |||
| return re.sub( | |||
| r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE) | |||
| def beAdocPdf(d, q, a, eng, image, poss): | |||
| qprefix = "Question: " if eng else "问题:" | |||
| aprefix = "Answer: " if eng else "回答:" | |||
| d["content_with_weight"] = "\t".join( | |||
| [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) | |||
| d["content_ltks"] = rag_tokenizer.tokenize(q) | |||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||
| d["image"] = image | |||
| add_positions(d, poss) | |||
| return d | |||
| def beAdocDocx(d, q, a, eng, image): | |||
| qprefix = "Question: " if eng else "问题:" | |||
| aprefix = "Answer: " if eng else "回答:" | |||
| d["content_with_weight"] = "\t".join( | |||
| [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) | |||
| d["content_ltks"] = rag_tokenizer.tokenize(q) | |||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||
| d["image"] = image | |||
| return d | |||
| def beAdoc(d, q, a, eng): | |||
| qprefix = "Question: " if eng else "问题:" | |||
| aprefix = "Answer: " if eng else "回答:" | |||
| d["content_with_weight"] = "\t".join( | |||
| [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) | |||
| d["content_ltks"] = rag_tokenizer.tokenize(q) | |||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||
| return d | |||
| def mdQuestionLevel(s): | |||
| match = re.match(r'#*', s) | |||
| return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s) | |||
| def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Excel, csv(txt), pdf, markdown and docx format files are supported. | |||
| If the file is in Excel format, it should contain two columns, question and answer, without a header, | |||
| and the question column must come before the answer column. | |||
| Multiple sheets are fine as long as the columns are arranged that way. | |||
| If the file is in csv format, it should be UTF-8 encoded, with TAB as the delimiter between question and answer. | |||
| Malformed lines are ignored. | |||
| Every Q&A pair is treated as a chunk. | |||
| """ | |||
| eng = lang.lower() == "english" | |||
| res = [] | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| if re.search(r"\.xlsx?$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| excel_parser = Excel() | |||
| for q, a in excel_parser(filename, binary, callback): | |||
| res.append(beAdoc(deepcopy(doc), q, a, eng)) | |||
| return res | |||
| elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| txt = "" | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| while True: | |||
| l = f.readline() | |||
| if not l: | |||
| break | |||
| txt += l | |||
| lines = txt.split("\n") | |||
| comma, tab = 0, 0 | |||
| for l in lines: | |||
| if len(l.split(",")) == 2: comma += 1 | |||
| if len(l.split("\t")) == 2: tab += 1 | |||
| delimiter = "\t" if tab >= comma else "," | |||
| fails = [] | |||
| question, answer = "", "" | |||
| i = 0 | |||
| while i < len(lines): | |||
| arr = lines[i].split(delimiter) | |||
| if len(arr) != 2: | |||
| if question: answer += "\n" + lines[i] | |||
| else: | |||
| fails.append(str(i+1)) | |||
| elif len(arr) == 2: | |||
| if question and answer: res.append(beAdoc(deepcopy(doc), question, answer, eng)) | |||
| question, answer = arr | |||
| i += 1 | |||
| if len(res) % 999 == 0: | |||
| callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + ( | |||
| f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) | |||
| if question: res.append(beAdoc(deepcopy(doc), question, answer, eng)) | |||
| callback(0.6, ("Extract Q&A: {}".format(len(res)) + ( | |||
| f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) | |||
| return res | |||
| elif re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| pdf_parser = Pdf() | |||
| qai_list, tbls = pdf_parser(filename if not binary else binary, | |||
| from_page=0, to_page=10000, callback=callback) | |||
| for q, a, image, poss in qai_list: | |||
| res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss)) | |||
| return res | |||
| elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| txt = "" | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| while True: | |||
| l = f.readline() | |||
| if not l: | |||
| break | |||
| txt += l | |||
| lines = txt.split("\n") | |||
| last_question, last_answer = "", "" | |||
| question_stack, level_stack = [], [] | |||
| code_block = False | |||
| level_index = [-1] * 7 | |||
| for index, l in enumerate(lines): | |||
| if l.strip().startswith('```'): | |||
| code_block = not code_block | |||
| question_level, question = 0, '' | |||
| if not code_block: | |||
| question_level, question = mdQuestionLevel(l) | |||
| if not question_level or question_level > 6: # not a question | |||
| last_answer = f'{last_answer}\n{l}' | |||
| else: # is a question | |||
| if last_answer.strip(): | |||
| sum_question = '\n'.join(question_stack) | |||
| if sum_question: | |||
| res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng)) | |||
| last_answer = '' | |||
| i = question_level | |||
| while question_stack and i <= level_stack[-1]: | |||
| question_stack.pop() | |||
| level_stack.pop() | |||
| question_stack.append(question) | |||
| level_stack.append(question_level) | |||
| if last_answer.strip(): | |||
| sum_question = '\n'.join(question_stack) | |||
| if sum_question: | |||
| res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng)) | |||
| return res | |||
| elif re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| docx_parser = Docx() | |||
| qai_list, tbls = docx_parser(filename, binary, | |||
| from_page=0, to_page=10000, callback=callback) | |||
| res = tokenize_table(tbls, doc, eng) | |||
| for q, a, image in qai_list: | |||
| res.append(beAdocDocx(deepcopy(doc), q, a, eng, image)) | |||
| return res | |||
| raise NotImplementedError( | |||
| "Excel, csv(txt), pdf, markdown and docx format files are supported.") | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) | |||
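| # --- Illustrative usage sketch (not part of the original file) --- | |||
| # Assumptions: "faq.txt" is a hypothetical name, the progress callback below is a stand-in, | |||
| # and the TAB-delimited bytes exercise the csv/txt branch of chunk() defined above. | |||
| sample = "What is RAGFlow?\tAn open-source RAG engine.\nWhich formats are supported?\tExcel, csv(txt), pdf, markdown and docx.\n" | |||
| def report(prog=None, msg=""): | |||
|     print(prog, msg)  # print the progress fraction and status message | |||
| chunks = chunk("faq.txt", binary=sample.encode("utf-8"), lang="English", callback=report) | |||
| # Each chunk keeps the joined pair in "content_with_weight", | |||
| # e.g. "Question: What is RAGFlow?\tAnswer: An open-source RAG engine." | |||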
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import re | |||
| from copy import deepcopy | |||
| from io import BytesIO | |||
| from timeit import default_timer as timer | |||
| from nltk import word_tokenize | |||
| from openpyxl import load_workbook | |||
| from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level | |||
| from rag.nlp import rag_tokenizer, tokenize_table, concat_img | |||
| from rag.settings import cron_logger | |||
| from deepdoc.parser import PdfParser, ExcelParser, DocxParser | |||
| from docx import Document | |||
| from PIL import Image | |||
| from markdown import markdown | |||
| class Excel(ExcelParser): | |||
| def __call__(self, fnm, binary=None, callback=None): | |||
| if not binary: | |||
| wb = load_workbook(fnm) | |||
| else: | |||
| wb = load_workbook(BytesIO(binary)) | |||
| total = 0 | |||
| for sheetname in wb.sheetnames: | |||
| total += len(list(wb[sheetname].rows)) | |||
| res, fails = [], [] | |||
| for sheetname in wb.sheetnames: | |||
| ws = wb[sheetname] | |||
| rows = list(ws.rows) | |||
| for i, r in enumerate(rows): | |||
| q, a = "", "" | |||
| for cell in r: | |||
| if not cell.value: | |||
| continue | |||
| if not q: | |||
| q = str(cell.value) | |||
| elif not a: | |||
| a = str(cell.value) | |||
| else: | |||
| break | |||
| if q and a: | |||
| res.append((q, a)) | |||
| else: | |||
| fails.append(str(i + 1)) | |||
| if len(res) % 999 == 0: | |||
| callback(len(res) * | |||
| 0.6 / | |||
| total, ("Extract Q&A: {}".format(len(res)) + | |||
| (f"{len(fails)} failure, line: %s..." % | |||
| (",".join(fails[:3])) if fails else ""))) | |||
| callback(0.6, ("Extract Q&A: {}. ".format(len(res)) + ( | |||
| f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) | |||
| self.is_english = is_english( | |||
| [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1]) | |||
| return res | |||
| class Pdf(PdfParser): | |||
| def __call__(self, filename, binary=None, from_page=0, | |||
| to_page=100000, zoomin=3, callback=None): | |||
| start = timer() | |||
| callback(msg="OCR is running...") | |||
| self.__images__( | |||
| filename if not binary else binary, | |||
| zoomin, | |||
| from_page, | |||
| to_page, | |||
| callback | |||
| ) | |||
| callback(msg="OCR finished") | |||
| cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) | |||
| start = timer() | |||
| self._layouts_rec(zoomin, drop=False) | |||
| callback(0.63, "Layout analysis finished.") | |||
| self._table_transformer_job(zoomin) | |||
| callback(0.65, "Table analysis finished.") | |||
| self._text_merge() | |||
| callback(0.67, "Text merging finished.") | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| #self._naive_vertical_merge() | |||
| # self._concat_downward() | |||
| #self._filter_forpages() | |||
| cron_logger.info("layouts: {}".format(timer() - start)) | |||
| sections = [b["text"] for b in self.boxes] | |||
| bull_x0_list = [] | |||
| q_bull, reg = qbullets_category(sections) | |||
| if q_bull == -1: | |||
| raise ValueError("Unable to recognize Q&A structure.") | |||
| qai_list = [] | |||
| last_q, last_a, last_tag = '', '', '' | |||
| last_index = -1 | |||
| last_box = {'text':''} | |||
| last_bull = None | |||
| def sort_key(element): | |||
| tbls_pn = element[1][0][0] | |||
| tbls_top = element[1][0][3] | |||
| return tbls_pn, tbls_top | |||
| tbls.sort(key=sort_key) | |||
| tbl_index = 0 | |||
| last_pn, last_bottom = 0, 0 | |||
| tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', '' | |||
| for box in self.boxes: | |||
| section, line_tag = box['text'], self._line_tag(box, zoomin) | |||
| has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list) | |||
| last_box, last_index, last_bull = box, index, has_bull | |||
| line_pn = float(line_tag.lstrip('@@').split('\t')[0]) | |||
| line_top = float(line_tag.rstrip('##').split('\t')[3]) | |||
| tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index) | |||
| if not has_bull: # No question bullet | |||
| if not last_q: | |||
| if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top): # image passed | |||
| tbl_index += 1 | |||
| continue | |||
| else: | |||
| sum_tag = line_tag | |||
| sum_section = section | |||
| while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \ | |||
| and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the middle of current answer | |||
| sum_tag = f'{tbl_tag}{sum_tag}' | |||
| sum_section = f'{tbl_text}{sum_section}' | |||
| tbl_index += 1 | |||
| tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index) | |||
| last_a = f'{last_a}{sum_section}' | |||
| last_tag = f'{last_tag}{sum_tag}' | |||
| else: | |||
| if last_q: | |||
| while ((tbl_pn == last_pn and tbl_top>= last_bottom) or (tbl_pn > last_pn)) \ | |||
| and ((tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)): # add image at the end of last answer | |||
| last_tag = f'{last_tag}{tbl_tag}' | |||
| last_a = f'{last_a}{tbl_text}' | |||
| tbl_index += 1 | |||
| tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index) | |||
| image, poss = self.crop(last_tag, need_position=True) | |||
| qai_list.append((last_q, last_a, image, poss)) | |||
| last_q, last_a, last_tag = '', '', '' | |||
| last_q = has_bull.group() | |||
| _, end = has_bull.span() | |||
| last_a = section[end:] | |||
| last_tag = line_tag | |||
| last_bottom = float(line_tag.rstrip('##').split('\t')[4]) | |||
| last_pn = line_pn | |||
| if last_q: | |||
| qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True))) | |||
| return qai_list, tbls | |||
| def get_tbls_info(self, tbls, tbl_index): | |||
| if tbl_index >= len(tbls): | |||
| return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', '' | |||
| tbl_pn = tbls[tbl_index][1][0][0]+1 | |||
| tbl_left = tbls[tbl_index][1][0][1] | |||
| tbl_right = tbls[tbl_index][1][0][2] | |||
| tbl_top = tbls[tbl_index][1][0][3] | |||
| tbl_bottom = tbls[tbl_index][1][0][4] | |||
| tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \ | |||
| .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom) | |||
| tbl_text = ''.join(tbls[tbl_index][0][1]) | |||
| return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text | |||
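| # Note on the position tags used above: both _line_tag() and tbl_tag follow the layout | |||
| # "@@page\tleft\tright\ttop\tbottom##", e.g. "@@3\t56.0\t520.4\t101.2\t148.9##" (hypothetical | |||
| # values) marks a block on page 3 with its bounding box, which is why __call__ can recover | |||
| # the page number with split('\t')[0] and the top/bottom coordinates with indexes 3 and 4. | |||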
| class Docx(DocxParser): | |||
| def __init__(self): | |||
| pass | |||
| def get_picture(self, document, paragraph): | |||
| img = paragraph._element.xpath('.//pic:pic') | |||
| if not img: | |||
| return None | |||
| img = img[0] | |||
| embed = img.xpath('.//a:blip/@r:embed')[0] | |||
| related_part = document.part.related_parts[embed] | |||
| image = related_part.image | |||
| image = Image.open(BytesIO(image.blob)).convert('RGB') | |||
| return image | |||
| def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None): | |||
| self.doc = Document( | |||
| filename) if not binary else Document(BytesIO(binary)) | |||
| pn = 0 | |||
| last_answer, last_image = "", None | |||
| question_stack, level_stack = [], [] | |||
| qai_list = [] | |||
| for p in self.doc.paragraphs: | |||
| if pn > to_page: | |||
| break | |||
| question_level, p_text = 0, '' | |||
| if from_page <= pn < to_page and p.text.strip(): | |||
| question_level, p_text = docx_question_level(p) | |||
| if not question_level or question_level > 6: # not a question | |||
| last_answer = f'{last_answer}\n{p_text}' | |||
| current_image = self.get_picture(self.doc, p) | |||
| last_image = concat_img(last_image, current_image) | |||
| else: # is a question | |||
| if last_answer or last_image: | |||
| sum_question = '\n'.join(question_stack) | |||
| if sum_question: | |||
| qai_list.append((sum_question, last_answer, last_image)) | |||
| last_answer, last_image = '', None | |||
| i = question_level | |||
| while question_stack and i <= level_stack[-1]: | |||
| question_stack.pop() | |||
| level_stack.pop() | |||
| question_stack.append(p_text) | |||
| level_stack.append(question_level) | |||
| for run in p.runs: | |||
| if 'lastRenderedPageBreak' in run._element.xml: | |||
| pn += 1 | |||
| continue | |||
| if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: | |||
| pn += 1 | |||
| if last_answer: | |||
| sum_question = '\n'.join(question_stack) | |||
| if sum_question: | |||
| qai_list.append((sum_question, last_answer, last_image)) | |||
| tbls = [] | |||
| for tb in self.doc.tables: | |||
| html= "<table>" | |||
| for r in tb.rows: | |||
| html += "<tr>" | |||
| i = 0 | |||
| while i < len(r.cells): | |||
| span = 1 | |||
| c = r.cells[i] | |||
| for j in range(i+1, len(r.cells)): | |||
| if c.text == r.cells[j].text: | |||
| span += 1 | |||
| i = j | |||
| i += 1 | |||
| html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>" | |||
| html += "</tr>" | |||
| html += "</table>" | |||
| tbls.append(((None, html), "")) | |||
| return qai_list, tbls | |||
| def rmPrefix(txt): | |||
| return re.sub( | |||
| r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE) | |||
| def beAdocPdf(d, q, a, eng, image, poss): | |||
| qprefix = "Question: " if eng else "问题:" | |||
| aprefix = "Answer: " if eng else "回答:" | |||
| d["content_with_weight"] = "\t".join( | |||
| [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) | |||
| d["content_ltks"] = rag_tokenizer.tokenize(q) | |||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||
| d["image"] = image | |||
| add_positions(d, poss) | |||
| return d | |||
| def beAdocDocx(d, q, a, eng, image): | |||
| qprefix = "Question: " if eng else "问题:" | |||
| aprefix = "Answer: " if eng else "回答:" | |||
| d["content_with_weight"] = "\t".join( | |||
| [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) | |||
| d["content_ltks"] = rag_tokenizer.tokenize(q) | |||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||
| d["image"] = image | |||
| return d | |||
| def beAdoc(d, q, a, eng): | |||
| qprefix = "Question: " if eng else "问题:" | |||
| aprefix = "Answer: " if eng else "回答:" | |||
| d["content_with_weight"] = "\t".join( | |||
| [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) | |||
| d["content_ltks"] = rag_tokenizer.tokenize(q) | |||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||
| return d | |||
| def mdQuestionLevel(s): | |||
| match = re.match(r'#*', s) | |||
| return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s) | |||
| def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Excel, csv(txt), pdf, markdown and docx format files are supported. | |||
| If the file is in Excel format, it should contain two columns, question and answer, with no header row. | |||
| The question column must come before the answer column. | |||
| Multiple sheets are fine as long as the columns are composed correctly. | |||
| If the file is in csv(txt) format, it should be UTF-8 encoded; TAB or comma can be used as the delimiter between question and answer (whichever matches more lines is chosen). | |||
| All malformed lines are ignored. | |||
| Every Q&A pair is treated as a chunk. | |||
| """ | |||
| eng = lang.lower() == "english" | |||
| res = [] | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| if re.search(r"\.xlsx?$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| excel_parser = Excel() | |||
| for q, a in excel_parser(filename, binary, callback): | |||
| res.append(beAdoc(deepcopy(doc), q, a, eng)) | |||
| return res | |||
| elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| txt = "" | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| while True: | |||
| l = f.readline() | |||
| if not l: | |||
| break | |||
| txt += l | |||
| lines = txt.split("\n") | |||
| comma, tab = 0, 0 | |||
| for l in lines: | |||
| if len(l.split(",")) == 2: comma += 1 | |||
| if len(l.split("\t")) == 2: tab += 1 | |||
| delimiter = "\t" if tab >= comma else "," | |||
| fails = [] | |||
| question, answer = "", "" | |||
| i = 0 | |||
| while i < len(lines): | |||
| arr = lines[i].split(delimiter) | |||
| if len(arr) != 2: | |||
| if question: answer += "\n" + lines[i] | |||
| else: | |||
| fails.append(str(i+1)) | |||
| elif len(arr) == 2: | |||
| if question and answer: res.append(beAdoc(deepcopy(doc), question, answer, eng)) | |||
| question, answer = arr | |||
| i += 1 | |||
| if len(res) % 999 == 0: | |||
| callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + ( | |||
| f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) | |||
| if question: res.append(beAdoc(deepcopy(doc), question, answer, eng)) | |||
| callback(0.6, ("Extract Q&A: {}".format(len(res)) + ( | |||
| f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) | |||
| return res | |||
| elif re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| pdf_parser = Pdf() | |||
| qai_list, tbls = pdf_parser(filename if not binary else binary, | |||
| from_page=0, to_page=10000, callback=callback) | |||
| for q, a, image, poss in qai_list: | |||
| res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss)) | |||
| return res | |||
| elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| txt = "" | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| while True: | |||
| l = f.readline() | |||
| if not l: | |||
| break | |||
| txt += l | |||
| lines = txt.split("\n") | |||
| last_question, last_answer = "", "" | |||
| question_stack, level_stack = [], [] | |||
| code_block = False | |||
| level_index = [-1] * 7 | |||
| for index, l in enumerate(lines): | |||
| if l.strip().startswith('```'): | |||
| code_block = not code_block | |||
| question_level, question = 0, '' | |||
| if not code_block: | |||
| question_level, question = mdQuestionLevel(l) | |||
| if not question_level or question_level > 6: # not a question | |||
| last_answer = f'{last_answer}\n{l}' | |||
| else: # is a question | |||
| if last_answer.strip(): | |||
| sum_question = '\n'.join(question_stack) | |||
| if sum_question: | |||
| res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng)) | |||
| last_answer = '' | |||
| i = question_level | |||
| while question_stack and i <= level_stack[-1]: | |||
| question_stack.pop() | |||
| level_stack.pop() | |||
| question_stack.append(question) | |||
| level_stack.append(question_level) | |||
| if last_answer.strip(): | |||
| sum_question = '\n'.join(question_stack) | |||
| if sum_question: | |||
| res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng)) | |||
| return res | |||
| elif re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| docx_parser = Docx() | |||
| qai_list, tbls = docx_parser(filename, binary, | |||
| from_page=0, to_page=10000, callback=callback) | |||
| res = tokenize_table(tbls, doc, eng) | |||
| for q, a, image in qai_list: | |||
| res.append(beAdocDocx(deepcopy(doc), q, a, eng, image)) | |||
| return res | |||
| raise NotImplementedError( | |||
| "Excel, csv(txt), pdf, markdown and docx format files are supported.") | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) | |||
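| # Illustrative sketch (not part of the original file): how the markdown branch of chunk() | |||
| # nests headings. The in-memory document and the no-op callback below are assumptions. | |||
| md = b"# Setup\nInstall the dependencies first.\n## GPU\nAlso install the CUDA build of onnxruntime.\n" | |||
| for c in chunk("notes.md", binary=md, lang="English", callback=lambda prog=None, msg="": None): | |||
|     print(c["content_with_weight"]) | |||
| # Every '#'-heading is pushed onto question_stack, so the first chunk pairs "Setup" with the | |||
| # first answer and the second pairs "Setup\nGPU" with the onnxruntime answer | |||
| # (answer text is rendered through markdown() before tokenization). | |||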
| @@ -1,173 +1,173 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import base64 | |||
| import datetime | |||
| import json | |||
| import re | |||
| import pandas as pd | |||
| import requests | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from rag.nlp import rag_tokenizer | |||
| from deepdoc.parser.resume import refactor | |||
| from deepdoc.parser.resume import step_one, step_two | |||
| from rag.settings import cron_logger | |||
| from rag.utils import rmSpace | |||
| forbidden_select_fields4resume = [ | |||
| "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd" | |||
| ] | |||
| def remote_call(filename, binary): | |||
| q = { | |||
| "header": { | |||
| "uid": 1, | |||
| "user": "kevinhu", | |||
| "log_id": filename | |||
| }, | |||
| "request": { | |||
| "p": { | |||
| "request_id": "1", | |||
| "encrypt_type": "base64", | |||
| "filename": filename, | |||
| "langtype": '', | |||
| "fileori": base64.b64encode(binary).decode('utf-8') | |||
| }, | |||
| "c": "resume_parse_module", | |||
| "m": "resume_parse" | |||
| } | |||
| } | |||
| for _ in range(3): | |||
| try: | |||
| resume = requests.post( | |||
| "http://127.0.0.1:61670/tog", | |||
| data=json.dumps(q)) | |||
| resume = resume.json()["response"]["results"] | |||
| resume = refactor(resume) | |||
| for k in ["education", "work", "project", | |||
| "training", "skill", "certificate", "language"]: | |||
| if not resume.get(k) and k in resume: | |||
| del resume[k] | |||
| resume = step_one.refactor(pd.DataFrame([{"resume_content": json.dumps(resume), "tob_resume_id": "x", | |||
| "updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}])) | |||
| resume = step_two.parse(resume) | |||
| return resume | |||
| except Exception as e: | |||
| cron_logger.error("Resume parser error: " + str(e)) | |||
| return {} | |||
| def chunk(filename, binary=None, callback=None, **kwargs): | |||
| """ | |||
| The supported file formats are pdf, doc, docx and txt. | |||
| To maximize effectiveness and make sure the resume is parsed correctly, please contact us: https://github.com/infiniflow/ragflow | |||
| """ | |||
| if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE): | |||
| raise NotImplementedError("file type not supported yet (pdf, doc, docx and txt are supported)") | |||
| if not binary: | |||
| with open(filename, "rb") as f: | |||
| binary = f.read() | |||
| callback(0.2, "Resume parsing is going on...") | |||
| resume = remote_call(filename, binary) | |||
| if len(resume.keys()) < 7: | |||
| callback(-1, "Resume was not parsed successfully.") | |||
| raise Exception("Resume parser remote call failed!") | |||
| callback(0.6, "Done parsing. Chunking...") | |||
| print(json.dumps(resume, ensure_ascii=False, indent=2)) | |||
| field_map = { | |||
| "name_kwd": "姓名/名字", | |||
| "name_pinyin_kwd": "姓名拼音/名字拼音", | |||
| "gender_kwd": "性别(男,女)", | |||
| "age_int": "年龄/岁/年纪", | |||
| "phone_kwd": "电话/手机/微信", | |||
| "email_tks": "email/e-mail/邮箱", | |||
| "position_name_tks": "职位/职能/岗位/职责", | |||
| "expect_city_names_tks": "期望城市", | |||
| "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年", | |||
| "corporation_name_tks": "最近就职(上班)的公司/上一家公司", | |||
| "first_school_name_tks": "第一学历毕业学校", | |||
| "first_degree_kwd": "第一学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)", | |||
| "highest_degree_kwd": "最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)", | |||
| "first_major_tks": "第一学历专业", | |||
| "edu_first_fea_kwd": "第一学历标签(211,留学,双一流,985,海外知名,重点大学,中专,专升本,专科,本科,大专)", | |||
| "degree_kwd": "过往学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)", | |||
| "major_tks": "学过的专业/过往专业", | |||
| "school_name_tks": "学校/毕业院校", | |||
| "sch_rank_kwd": "学校标签(顶尖学校,精英学校,优质学校,一般学校)", | |||
| "edu_fea_kwd": "教育标签(211,留学,双一流,985,海外知名,重点大学,中专,专升本,专科,本科,大专)", | |||
| "corp_nm_tks": "就职过的公司/之前的公司/上过班的公司", | |||
| "edu_end_int": "毕业年份", | |||
| "industry_name_tks": "所在行业", | |||
| "birth_dt": "生日/出生年份", | |||
| "expect_position_name_tks": "期望职位/期望职能/期望岗位", | |||
| } | |||
| titles = [] | |||
| for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]: | |||
| v = resume.get(n, "") | |||
| if isinstance(v, list): | |||
| v = v[0] | |||
| if n.find("tks") > 0: | |||
| v = rmSpace(v) | |||
| titles.append(str(v)) | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历") | |||
| } | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| pairs = [] | |||
| for n, m in field_map.items(): | |||
| if not resume.get(n): | |||
| continue | |||
| v = resume[n] | |||
| if isinstance(v, list): | |||
| v = " ".join(v) | |||
| if n.find("tks") > 0: | |||
| v = rmSpace(v) | |||
| pairs.append((m, str(v))) | |||
| doc["content_with_weight"] = "\n".join( | |||
| ["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs]) | |||
| doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"]) | |||
| doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"]) | |||
| for n, _ in field_map.items(): | |||
| if n not in resume: | |||
| continue | |||
| if isinstance(resume[n], list) and ( | |||
| len(resume[n]) == 1 or n not in forbidden_select_fields4resume): | |||
| resume[n] = resume[n][0] | |||
| if n.find("_tks") > 0: | |||
| resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n]) | |||
| doc[n] = resume[n] | |||
| print(doc) | |||
| KnowledgebaseService.update_parser_config( | |||
| kwargs["kb_id"], {"field_map": field_map}) | |||
| return [doc] | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(a, b): | |||
| pass | |||
| chunk(sys.argv[1], callback=dummy) | |||
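| # Illustrative usage sketch (not part of the original file). Assumptions: the resume-parsing | |||
| # service that remote_call() posts to on 127.0.0.1:61670 is running, "candidate.pdf" exists, | |||
| # and "kb-123" is a hypothetical knowledge-base id. | |||
| def progress(prog=None, msg=""): | |||
|     print(prog, msg) | |||
| docs = chunk("candidate.pdf", callback=progress, kb_id="kb-123") | |||
| # chunk() returns a single-element list: one dict per resume with searchable fields such as | |||
| # "name_kwd" plus the flattened "content_with_weight" text built from field_map. | |||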
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import base64 | |||
| import datetime | |||
| import json | |||
| import re | |||
| import pandas as pd | |||
| import requests | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from rag.nlp import rag_tokenizer | |||
| from deepdoc.parser.resume import refactor | |||
| from deepdoc.parser.resume import step_one, step_two | |||
| from rag.settings import cron_logger | |||
| from rag.utils import rmSpace | |||
| forbidden_select_fields4resume = [ | |||
| "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd" | |||
| ] | |||
| def remote_call(filename, binary): | |||
| q = { | |||
| "header": { | |||
| "uid": 1, | |||
| "user": "kevinhu", | |||
| "log_id": filename | |||
| }, | |||
| "request": { | |||
| "p": { | |||
| "request_id": "1", | |||
| "encrypt_type": "base64", | |||
| "filename": filename, | |||
| "langtype": '', | |||
| "fileori": base64.b64encode(binary).decode('utf-8') | |||
| }, | |||
| "c": "resume_parse_module", | |||
| "m": "resume_parse" | |||
| } | |||
| } | |||
| for _ in range(3): | |||
| try: | |||
| resume = requests.post( | |||
| "http://127.0.0.1:61670/tog", | |||
| data=json.dumps(q)) | |||
| resume = resume.json()["response"]["results"] | |||
| resume = refactor(resume) | |||
| for k in ["education", "work", "project", | |||
| "training", "skill", "certificate", "language"]: | |||
| if not resume.get(k) and k in resume: | |||
| del resume[k] | |||
| resume = step_one.refactor(pd.DataFrame([{"resume_content": json.dumps(resume), "tob_resume_id": "x", | |||
| "updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}])) | |||
| resume = step_two.parse(resume) | |||
| return resume | |||
| except Exception as e: | |||
| cron_logger.error("Resume parser error: " + str(e)) | |||
| return {} | |||
| def chunk(filename, binary=None, callback=None, **kwargs): | |||
| """ | |||
| The supported file formats are pdf, doc, docx and txt. | |||
| To maximize effectiveness and make sure the resume is parsed correctly, please contact us: https://github.com/infiniflow/ragflow | |||
| """ | |||
| if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE): | |||
| raise NotImplementedError("file type not supported yet (pdf, doc, docx and txt are supported)") | |||
| if not binary: | |||
| with open(filename, "rb") as f: | |||
| binary = f.read() | |||
| callback(0.2, "Resume parsing is going on...") | |||
| resume = remote_call(filename, binary) | |||
| if len(resume.keys()) < 7: | |||
| callback(-1, "Resume was not parsed successfully.") | |||
| raise Exception("Resume parser remote call failed!") | |||
| callback(0.6, "Done parsing. Chunking...") | |||
| print(json.dumps(resume, ensure_ascii=False, indent=2)) | |||
| field_map = { | |||
| "name_kwd": "姓名/名字", | |||
| "name_pinyin_kwd": "姓名拼音/名字拼音", | |||
| "gender_kwd": "性别(男,女)", | |||
| "age_int": "年龄/岁/年纪", | |||
| "phone_kwd": "电话/手机/微信", | |||
| "email_tks": "email/e-mail/邮箱", | |||
| "position_name_tks": "职位/职能/岗位/职责", | |||
| "expect_city_names_tks": "期望城市", | |||
| "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年", | |||
| "corporation_name_tks": "最近就职(上班)的公司/上一家公司", | |||
| "first_school_name_tks": "第一学历毕业学校", | |||
| "first_degree_kwd": "第一学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)", | |||
| "highest_degree_kwd": "最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)", | |||
| "first_major_tks": "第一学历专业", | |||
| "edu_first_fea_kwd": "第一学历标签(211,留学,双一流,985,海外知名,重点大学,中专,专升本,专科,本科,大专)", | |||
| "degree_kwd": "过往学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)", | |||
| "major_tks": "学过的专业/过往专业", | |||
| "school_name_tks": "学校/毕业院校", | |||
| "sch_rank_kwd": "学校标签(顶尖学校,精英学校,优质学校,一般学校)", | |||
| "edu_fea_kwd": "教育标签(211,留学,双一流,985,海外知名,重点大学,中专,专升本,专科,本科,大专)", | |||
| "corp_nm_tks": "就职过的公司/之前的公司/上过班的公司", | |||
| "edu_end_int": "毕业年份", | |||
| "industry_name_tks": "所在行业", | |||
| "birth_dt": "生日/出生年份", | |||
| "expect_position_name_tks": "期望职位/期望职能/期望岗位", | |||
| } | |||
| titles = [] | |||
| for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]: | |||
| v = resume.get(n, "") | |||
| if isinstance(v, list): | |||
| v = v[0] | |||
| if n.find("tks") > 0: | |||
| v = rmSpace(v) | |||
| titles.append(str(v)) | |||
| doc = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历") | |||
| } | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| pairs = [] | |||
| for n, m in field_map.items(): | |||
| if not resume.get(n): | |||
| continue | |||
| v = resume[n] | |||
| if isinstance(v, list): | |||
| v = " ".join(v) | |||
| if n.find("tks") > 0: | |||
| v = rmSpace(v) | |||
| pairs.append((m, str(v))) | |||
| doc["content_with_weight"] = "\n".join( | |||
| ["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs]) | |||
| doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"]) | |||
| doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"]) | |||
| for n, _ in field_map.items(): | |||
| if n not in resume: | |||
| continue | |||
| if isinstance(resume[n], list) and ( | |||
| len(resume[n]) == 1 or n not in forbidden_select_fields4resume): | |||
| resume[n] = resume[n][0] | |||
| if n.find("_tks") > 0: | |||
| resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n]) | |||
| doc[n] = resume[n] | |||
| print(doc) | |||
| KnowledgebaseService.update_parser_config( | |||
| kwargs["kb_id"], {"field_map": field_map}) | |||
| return [doc] | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(a, b): | |||
| pass | |||
| chunk(sys.argv[1], callback=dummy) | |||
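| # Minimal sketch (hypothetical values, not produced by this file) of how chunk() flattens a | |||
| # parsed resume into the indexed text: every field present in field_map contributes one | |||
| # "label: value" line, and "_tks" values are cleaned by rmSpace() before being joined. | |||
| parsed = {"name_kwd": "张三", "phone_kwd": "13800000000", "edu_end_int": 2020} | |||
| labels = {"name_kwd": "姓名/名字", "phone_kwd": "电话/手机/微信", "edu_end_int": "毕业年份"} | |||
| content = "\n".join("{}: {}".format(labels[k], v) for k, v in parsed.items()) | |||
| print(content)  # roughly what ends up in doc["content_with_weight"] before tokenization | |||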
| @@ -1,252 +1,252 @@ | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import copy | |||
| import re | |||
| from io import BytesIO | |||
| from xpinyin import Pinyin | |||
| import numpy as np | |||
| import pandas as pd | |||
| from openpyxl import load_workbook | |||
| from dateutil.parser import parse as datetime_parse | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec | |||
| from deepdoc.parser import ExcelParser | |||
| class Excel(ExcelParser): | |||
| def __call__(self, fnm, binary=None, from_page=0, | |||
| to_page=10000000000, callback=None): | |||
| if not binary: | |||
| wb = load_workbook(fnm) | |||
| else: | |||
| wb = load_workbook(BytesIO(binary)) | |||
| total = 0 | |||
| for sheetname in wb.sheetnames: | |||
| total += len(list(wb[sheetname].rows)) | |||
| res, fails, done = [], [], 0 | |||
| rn = 0 | |||
| for sheetname in wb.sheetnames: | |||
| ws = wb[sheetname] | |||
| rows = list(ws.rows) | |||
| if not rows:continue | |||
| headers = [cell.value for cell in rows[0]] | |||
| missed = set([i for i, h in enumerate(headers) if h is None]) | |||
| headers = [ | |||
| cell.value for i, | |||
| cell in enumerate( | |||
| rows[0]) if i not in missed] | |||
| if not headers:continue | |||
| data = [] | |||
| for i, r in enumerate(rows[1:]): | |||
| rn += 1 | |||
| if rn - 1 < from_page: | |||
| continue | |||
| if rn - 1 >= to_page: | |||
| break | |||
| row = [ | |||
| cell.value for ii, | |||
| cell in enumerate(r) if ii not in missed] | |||
| if len(row) != len(headers): | |||
| fails.append(str(i)) | |||
| continue | |||
| data.append(row) | |||
| done += 1 | |||
| res.append(pd.DataFrame(np.array(data), columns=headers)) | |||
| callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + ( | |||
| f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) | |||
| return res | |||
| def trans_datatime(s): | |||
| try: | |||
| return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S") | |||
| except Exception as e: | |||
| pass | |||
| def trans_bool(s): | |||
| if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", | |||
| str(s).strip(), flags=re.IGNORECASE): | |||
| return "yes" | |||
| if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE): | |||
| return "no" | |||
| def column_data_type(arr): | |||
| arr = list(arr) | |||
| uni = len(set([a for a in arr if a is not None])) | |||
| counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0} | |||
| trans = {t: f for f, t in | |||
| [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]} | |||
| for a in arr: | |||
| if a is None: | |||
| continue | |||
| if re.match(r"[+-]?[0-9]+(\.0+)?$", str(a).replace("%%", "")): | |||
| counts["int"] += 1 | |||
| elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")): | |||
| counts["float"] += 1 | |||
| elif re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√|false|no|否|⍻|×)$", str(a), flags=re.IGNORECASE): | |||
| counts["bool"] += 1 | |||
| elif trans_datatime(str(a)): | |||
| counts["datetime"] += 1 | |||
| else: | |||
| counts["text"] += 1 | |||
| counts = sorted(counts.items(), key=lambda x: x[1] * -1) | |||
| ty = counts[0][0] | |||
| for i in range(len(arr)): | |||
| if arr[i] is None: | |||
| continue | |||
| try: | |||
| arr[i] = trans[ty](str(arr[i])) | |||
| except Exception as e: | |||
| arr[i] = None | |||
| # if ty == "text": | |||
| # if len(arr) > 128 and uni / len(arr) < 0.1: | |||
| # ty = "keyword" | |||
| return arr, ty | |||
| def chunk(filename, binary=None, from_page=0, to_page=10000000000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Excel and csv(txt) format files are supported. | |||
| For csv or txt files, the delimiter between columns is TAB. | |||
| The first line must contain the column headers. | |||
| Column headers must be meaningful terms so that our NLP model can understand them. | |||
| It is good to enumerate synonyms separated by a slash '/', and even better to | |||
| enumerate possible values in brackets, e.g. 'gender/sex(male, female)'. | |||
| Here are some example headers: | |||
| 1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL) | |||
| 2. 姓名/名字\t电话/手机/微信\t最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA) | |||
| Every row in the table will be treated as a chunk. | |||
| """ | |||
| if re.search(r"\.xlsx?$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| excel_parser = Excel() | |||
| dfs = excel_parser( | |||
| filename, | |||
| binary, | |||
| from_page=from_page, | |||
| to_page=to_page, | |||
| callback=callback) | |||
| elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| txt = "" | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| while True: | |||
| l = f.readline() | |||
| if not l: | |||
| break | |||
| txt += l | |||
| lines = txt.split("\n") | |||
| fails = [] | |||
| headers = lines[0].split(kwargs.get("delimiter", "\t")) | |||
| rows = [] | |||
| for i, line in enumerate(lines[1:]): | |||
| if i < from_page: | |||
| continue | |||
| if i >= to_page: | |||
| break | |||
| row = [l for l in line.split(kwargs.get("delimiter", "\t"))] | |||
| if len(row) != len(headers): | |||
| fails.append(str(i)) | |||
| continue | |||
| rows.append(row) | |||
| callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + ( | |||
| f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) | |||
| dfs = [pd.DataFrame(np.array(rows), columns=headers)] | |||
| else: | |||
| raise NotImplementedError( | |||
| "file type not supported yet(excel, text, csv supported)") | |||
| res = [] | |||
| PY = Pinyin() | |||
| fields_map = { | |||
| "text": "_tks", | |||
| "int": "_long", | |||
| "keyword": "_kwd", | |||
| "float": "_flt", | |||
| "datetime": "_dt", | |||
| "bool": "_kwd"} | |||
| for df in dfs: | |||
| for n in ["id", "_id", "index", "idx"]: | |||
| if n in df.columns: | |||
| del df[n] | |||
| clmns = df.columns.values | |||
| txts = list(copy.deepcopy(clmns)) | |||
| py_clmns = [ | |||
| PY.get_pinyins( | |||
| re.sub( | |||
| r"(/.*|([^()]+?)|\([^()]+?\))", | |||
| "", | |||
| str(n)), | |||
| '_')[0] for n in clmns] | |||
| clmn_tys = [] | |||
| for j in range(len(clmns)): | |||
| cln, ty = column_data_type(df[clmns[j]]) | |||
| clmn_tys.append(ty) | |||
| df[clmns[j]] = cln | |||
| if ty == "text": | |||
| txts.extend([str(c) for c in cln if c]) | |||
| clmns_map = [(py_clmns[i].lower() + fields_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) | |||
| for i in range(len(clmns))] | |||
| eng = lang.lower() == "english" # is_english(txts) | |||
| for ii, row in df.iterrows(): | |||
| d = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| row_txt = [] | |||
| for j in range(len(clmns)): | |||
| if row[clmns[j]] is None: | |||
| continue | |||
| if not str(row[clmns[j]]): | |||
| continue | |||
| if pd.isna(row[clmns[j]]): | |||
| continue | |||
| fld = clmns_map[j][0] | |||
| d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize( | |||
| row[clmns[j]]) | |||
| row_txt.append("{}:{}".format(clmns[j], row[clmns[j]])) | |||
| if not row_txt: | |||
| continue | |||
| tokenize(d, "; ".join(row_txt), eng) | |||
| res.append(d) | |||
| KnowledgebaseService.update_parser_config( | |||
| kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}}) | |||
| callback(0.35, "") | |||
| return res | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], callback=dummy) | |||
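| # Illustrative sketch (made-up values, not part of the original file): column_data_type() | |||
| # votes on the dominant type of a column and coerces every cell to it; cells that fail the | |||
| # conversion become None. | |||
| values, ty = column_data_type(["12", "7", "x", None]) | |||
| print(ty, values)  # -> "int" wins two votes to one, and "x" cannot be int(), so it becomes None | |||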
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import copy | |||
| import re | |||
| from io import BytesIO | |||
| from xpinyin import Pinyin | |||
| import numpy as np | |||
| import pandas as pd | |||
| from openpyxl import load_workbook | |||
| from dateutil.parser import parse as datetime_parse | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec | |||
| from deepdoc.parser import ExcelParser | |||
| class Excel(ExcelParser): | |||
| def __call__(self, fnm, binary=None, from_page=0, | |||
| to_page=10000000000, callback=None): | |||
| if not binary: | |||
| wb = load_workbook(fnm) | |||
| else: | |||
| wb = load_workbook(BytesIO(binary)) | |||
| total = 0 | |||
| for sheetname in wb.sheetnames: | |||
| total += len(list(wb[sheetname].rows)) | |||
| res, fails, done = [], [], 0 | |||
| rn = 0 | |||
| for sheetname in wb.sheetnames: | |||
| ws = wb[sheetname] | |||
| rows = list(ws.rows) | |||
| if not rows:continue | |||
| headers = [cell.value for cell in rows[0]] | |||
| missed = set([i for i, h in enumerate(headers) if h is None]) | |||
| headers = [ | |||
| cell.value for i, | |||
| cell in enumerate( | |||
| rows[0]) if i not in missed] | |||
| if not headers:continue | |||
| data = [] | |||
| for i, r in enumerate(rows[1:]): | |||
| rn += 1 | |||
| if rn - 1 < from_page: | |||
| continue | |||
| if rn - 1 >= to_page: | |||
| break | |||
| row = [ | |||
| cell.value for ii, | |||
| cell in enumerate(r) if ii not in missed] | |||
| if len(row) != len(headers): | |||
| fails.append(str(i)) | |||
| continue | |||
| data.append(row) | |||
| done += 1 | |||
| res.append(pd.DataFrame(np.array(data), columns=headers)) | |||
| callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + ( | |||
| f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) | |||
| return res | |||
| def trans_datatime(s): | |||
| try: | |||
| return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S") | |||
| except Exception as e: | |||
| pass | |||
| def trans_bool(s): | |||
| if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", | |||
| str(s).strip(), flags=re.IGNORECASE): | |||
| return "yes" | |||
| if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE): | |||
| return "no" | |||
| def column_data_type(arr): | |||
| arr = list(arr) | |||
| uni = len(set([a for a in arr if a is not None])) | |||
| counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0} | |||
| trans = {t: f for f, t in | |||
| [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]} | |||
| for a in arr: | |||
| if a is None: | |||
| continue | |||
| if re.match(r"[+-]?[0-9]+(\.0+)?$", str(a).replace("%%", "")): | |||
| counts["int"] += 1 | |||
| elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")): | |||
| counts["float"] += 1 | |||
| elif re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√|false|no|否|⍻|×)$", str(a), flags=re.IGNORECASE): | |||
| counts["bool"] += 1 | |||
| elif trans_datatime(str(a)): | |||
| counts["datetime"] += 1 | |||
| else: | |||
| counts["text"] += 1 | |||
| counts = sorted(counts.items(), key=lambda x: x[1] * -1) | |||
| ty = counts[0][0] | |||
| for i in range(len(arr)): | |||
| if arr[i] is None: | |||
| continue | |||
| try: | |||
| arr[i] = trans[ty](str(arr[i])) | |||
| except Exception as e: | |||
| arr[i] = None | |||
| # if ty == "text": | |||
| # if len(arr) > 128 and uni / len(arr) < 0.1: | |||
| # ty = "keyword" | |||
| return arr, ty | |||
| def chunk(filename, binary=None, from_page=0, to_page=10000000000, | |||
| lang="Chinese", callback=None, **kwargs): | |||
| """ | |||
| Excel and csv(txt) format files are supported. | |||
| For csv or txt files, the delimiter between columns is TAB. | |||
| The first line must contain the column headers. | |||
| Column headers must be meaningful terms so that our NLP model can understand them. | |||
| It is good to enumerate synonyms separated by a slash '/', and even better to | |||
| enumerate possible values in brackets, e.g. 'gender/sex(male, female)'. | |||
| Here are some example headers: | |||
| 1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL) | |||
| 2. 姓名/名字\t电话/手机/微信\t最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA) | |||
| Every row in the table will be treated as a chunk. | |||
| """ | |||
| if re.search(r"\.xlsx?$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| excel_parser = Excel() | |||
| dfs = excel_parser( | |||
| filename, | |||
| binary, | |||
| from_page=from_page, | |||
| to_page=to_page, | |||
| callback=callback) | |||
| elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| txt = "" | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| else: | |||
| with open(filename, "r") as f: | |||
| while True: | |||
| l = f.readline() | |||
| if not l: | |||
| break | |||
| txt += l | |||
| lines = txt.split("\n") | |||
| fails = [] | |||
| headers = lines[0].split(kwargs.get("delimiter", "\t")) | |||
| rows = [] | |||
| for i, line in enumerate(lines[1:]): | |||
| if i < from_page: | |||
| continue | |||
| if i >= to_page: | |||
| break | |||
| row = [l for l in line.split(kwargs.get("delimiter", "\t"))] | |||
| if len(row) != len(headers): | |||
| fails.append(str(i)) | |||
| continue | |||
| rows.append(row) | |||
| callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + ( | |||
| f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) | |||
| dfs = [pd.DataFrame(np.array(rows), columns=headers)] | |||
| else: | |||
| raise NotImplementedError( | |||
| "file type not supported yet(excel, text, csv supported)") | |||
| res = [] | |||
| PY = Pinyin() | |||
| fields_map = { | |||
| "text": "_tks", | |||
| "int": "_long", | |||
| "keyword": "_kwd", | |||
| "float": "_flt", | |||
| "datetime": "_dt", | |||
| "bool": "_kwd"} | |||
| for df in dfs: | |||
| for n in ["id", "_id", "index", "idx"]: | |||
| if n in df.columns: | |||
| del df[n] | |||
| clmns = df.columns.values | |||
| txts = list(copy.deepcopy(clmns)) | |||
| py_clmns = [ | |||
| PY.get_pinyins( | |||
| re.sub( | |||
| r"(/.*|([^()]+?)|\([^()]+?\))", | |||
| "", | |||
| str(n)), | |||
| '_')[0] for n in clmns] | |||
| clmn_tys = [] | |||
| for j in range(len(clmns)): | |||
| cln, ty = column_data_type(df[clmns[j]]) | |||
| clmn_tys.append(ty) | |||
| df[clmns[j]] = cln | |||
| if ty == "text": | |||
| txts.extend([str(c) for c in cln if c]) | |||
| clmns_map = [(py_clmns[i].lower() + fields_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) | |||
| for i in range(len(clmns))] | |||
| eng = lang.lower() == "english" # is_english(txts) | |||
| for ii, row in df.iterrows(): | |||
| d = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| row_txt = [] | |||
| for j in range(len(clmns)): | |||
| if row[clmns[j]] is None: | |||
| continue | |||
| if not str(row[clmns[j]]): | |||
| continue | |||
| if pd.isna(row[clmns[j]]): | |||
| continue | |||
| fld = clmns_map[j][0] | |||
| d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize( | |||
| row[clmns[j]]) | |||
| row_txt.append("{}:{}".format(clmns[j], row[clmns[j]])) | |||
| if not row_txt: | |||
| continue | |||
| tokenize(d, "; ".join(row_txt), eng) | |||
| res.append(d) | |||
| KnowledgebaseService.update_parser_config( | |||
| kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}}) | |||
| callback(0.35, "") | |||
| return res | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], callback=dummy) | |||
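| # Note on field naming (sketch; the exact romanisation depends on xpinyin): each header is | |||
| # stripped of its '/...' synonyms and bracketed hints, romanised, lower-cased and suffixed | |||
| # with the detected type, so a text column "姓名/名字" becomes roughly "xing_ming_tks" and an | |||
| # integer column "毕业年份" becomes roughly "bi_ye_nian_fen_long"; the original header (with | |||
| # '_' turned into spaces) is stored as the display name in the knowledge base's field_map. | |||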
| @@ -1,171 +1,171 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import argparse | |||
| import pickle | |||
| import random | |||
| import time | |||
| from copy import deepcopy | |||
| from multiprocessing.connection import Listener | |||
| from threading import Thread | |||
| from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer | |||
| def torch_gc(): | |||
| try: | |||
| import torch | |||
| if torch.cuda.is_available(): | |||
| # with torch.cuda.device(DEVICE): | |||
| torch.cuda.empty_cache() | |||
| torch.cuda.ipc_collect() | |||
| elif torch.backends.mps.is_available(): | |||
| try: | |||
| from torch.mps import empty_cache | |||
| empty_cache() | |||
| except Exception as e: | |||
| pass | |||
| except Exception: | |||
| pass | |||
| class RPCHandler: | |||
| def __init__(self): | |||
| self._functions = {} | |||
| def register_function(self, func): | |||
| self._functions[func.__name__] = func | |||
| def handle_connection(self, connection): | |||
| try: | |||
| while True: | |||
| # Receive a message | |||
| func_name, args, kwargs = pickle.loads(connection.recv()) | |||
| # Run the RPC and send a response | |||
| try: | |||
| r = self._functions[func_name](*args, **kwargs) | |||
| connection.send(pickle.dumps(r)) | |||
| except Exception as e: | |||
| connection.send(pickle.dumps(e)) | |||
| except EOFError: | |||
| pass | |||
| def rpc_server(hdlr, address, authkey): | |||
| sock = Listener(address, authkey=authkey) | |||
| while True: | |||
| try: | |||
| client = sock.accept() | |||
| t = Thread(target=hdlr.handle_connection, args=(client,)) | |||
| t.daemon = True | |||
| t.start() | |||
| except Exception as e: | |||
| print("【EXCEPTION】:", str(e)) | |||
| models = [] | |||
| tokenizer = None | |||
| def chat(messages, gen_conf): | |||
| global tokenizer | |||
| model = Model() | |||
| try: | |||
| torch_gc() | |||
| conf = { | |||
| "max_new_tokens": int( | |||
| gen_conf.get( | |||
| "max_tokens", 256)), "temperature": float( | |||
| gen_conf.get( | |||
| "temperature", 0.1))} | |||
| print(messages, conf) | |||
| text = tokenizer.apply_chat_template( | |||
| messages, | |||
| tokenize=False, | |||
| add_generation_prompt=True | |||
| ) | |||
| model_inputs = tokenizer([text], return_tensors="pt").to(model.device) | |||
| generated_ids = model.generate( | |||
| model_inputs.input_ids, | |||
| **conf | |||
| ) | |||
| generated_ids = [ | |||
| output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) | |||
| ] | |||
| return tokenizer.batch_decode( | |||
| generated_ids, skip_special_tokens=True)[0] | |||
| except Exception as e: | |||
| return str(e) | |||
| def chat_streamly(messages, gen_conf): | |||
| global tokenizer | |||
| model = Model() | |||
| try: | |||
| torch_gc() | |||
| conf = deepcopy(gen_conf) | |||
| print(messages, conf) | |||
| text = tokenizer.apply_chat_template( | |||
| messages, | |||
| tokenize=False, | |||
| add_generation_prompt=True | |||
| ) | |||
| model_inputs = tokenizer([text], return_tensors="pt").to(model.device) | |||
| streamer = TextStreamer(tokenizer) | |||
| conf["inputs"] = model_inputs.input_ids | |||
| conf["streamer"] = streamer | |||
| conf["max_new_tokens"] = conf["max_tokens"] | |||
| del conf["max_tokens"] | |||
| thread = Thread(target=model.generate, kwargs=conf) | |||
| thread.start() | |||
| for _, new_text in enumerate(streamer): | |||
| yield new_text | |||
| except Exception as e: | |||
| yield "**ERROR**: " + str(e) | |||
| def Model(): | |||
| global models | |||
| random.seed(time.time()) | |||
| return random.choice(models) | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser() | |||
| parser.add_argument("--model_name", type=str, help="Model name") | |||
| parser.add_argument( | |||
| "--port", | |||
| default=7860, | |||
| type=int, | |||
| help="RPC serving port") | |||
| args = parser.parse_args() | |||
| handler = RPCHandler() | |||
| handler.register_function(chat) | |||
| handler.register_function(chat_streamly) | |||
| models = [] | |||
| for _ in range(1): | |||
| m = AutoModelForCausalLM.from_pretrained(args.model_name, | |||
| device_map="auto", | |||
| torch_dtype='auto') | |||
| models.append(m) | |||
| tokenizer = AutoTokenizer.from_pretrained(args.model_name) | |||
| # Run the server | |||
| rpc_server(handler, ('0.0.0.0', args.port), | |||
| authkey=b'infiniflow-token4kevinhu') | |||
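| # Minimal client sketch for the server above (assumes it is running locally on the default | |||
| # port 7860; the prompt text is arbitrary): | |||
| # | |||
| #   from multiprocessing.connection import Client | |||
| #   import pickle | |||
| #   conn = Client(('127.0.0.1', 7860), authkey=b'infiniflow-token4kevinhu') | |||
| #   conn.send(pickle.dumps(("chat", ([{"role": "user", "content": "Hi"}], {"max_tokens": 64}), {}))) | |||
| #   print(pickle.loads(conn.recv())) | |||
| # | |||
| # The tuple mirrors what handle_connection() unpickles: (function name, args, kwargs). | |||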
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import argparse | |||
| import pickle | |||
| import random | |||
| import time | |||
| from copy import deepcopy | |||
| from multiprocessing.connection import Listener | |||
| from threading import Thread | |||
| from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer | |||
| def torch_gc(): | |||
| try: | |||
| import torch | |||
| if torch.cuda.is_available(): | |||
| # with torch.cuda.device(DEVICE): | |||
| torch.cuda.empty_cache() | |||
| torch.cuda.ipc_collect() | |||
| elif torch.backends.mps.is_available(): | |||
| try: | |||
| from torch.mps import empty_cache | |||
| empty_cache() | |||
| except Exception as e: | |||
| pass | |||
| except Exception: | |||
| pass | |||
| class RPCHandler: | |||
| def __init__(self): | |||
| self._functions = {} | |||
| def register_function(self, func): | |||
| self._functions[func.__name__] = func | |||
| def handle_connection(self, connection): | |||
| try: | |||
| while True: | |||
| # Receive a message | |||
| func_name, args, kwargs = pickle.loads(connection.recv()) | |||
| # Run the RPC and send a response | |||
| try: | |||
| r = self._functions[func_name](*args, **kwargs) | |||
| connection.send(pickle.dumps(r)) | |||
| except Exception as e: | |||
| connection.send(pickle.dumps(e)) | |||
| except EOFError: | |||
| pass | |||
| def rpc_server(hdlr, address, authkey): | |||
| sock = Listener(address, authkey=authkey) | |||
| while True: | |||
| try: | |||
| client = sock.accept() | |||
| t = Thread(target=hdlr.handle_connection, args=(client,)) | |||
| t.daemon = True | |||
| t.start() | |||
| except Exception as e: | |||
| print("【EXCEPTION】:", str(e)) | |||
| models = [] | |||
| tokenizer = None | |||
| def chat(messages, gen_conf): | |||
| global tokenizer | |||
| model = Model() | |||
| try: | |||
| torch_gc() | |||
| conf = { | |||
| "max_new_tokens": int( | |||
| gen_conf.get( | |||
| "max_tokens", 256)), "temperature": float( | |||
| gen_conf.get( | |||
| "temperature", 0.1))} | |||
| print(messages, conf) | |||
| text = tokenizer.apply_chat_template( | |||
| messages, | |||
| tokenize=False, | |||
| add_generation_prompt=True | |||
| ) | |||
| model_inputs = tokenizer([text], return_tensors="pt").to(model.device) | |||
| generated_ids = model.generate( | |||
| model_inputs.input_ids, | |||
| **conf | |||
| ) | |||
| generated_ids = [ | |||
| output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) | |||
| ] | |||
| return tokenizer.batch_decode( | |||
| generated_ids, skip_special_tokens=True)[0] | |||
| except Exception as e: | |||
| return str(e) | |||
| def chat_streamly(messages, gen_conf): | |||
| global tokenizer | |||
| model = Model() | |||
| try: | |||
| torch_gc() | |||
| conf = deepcopy(gen_conf) | |||
| print(messages, conf) | |||
| text = tokenizer.apply_chat_template( | |||
| messages, | |||
| tokenize=False, | |||
| add_generation_prompt=True | |||
| ) | |||
| model_inputs = tokenizer([text], return_tensors="pt").to(model.device) | |||
| streamer = TextStreamer(tokenizer) | |||
| conf["inputs"] = model_inputs.input_ids | |||
| conf["streamer"] = streamer | |||
| conf["max_new_tokens"] = conf["max_tokens"] | |||
| del conf["max_tokens"] | |||
| thread = Thread(target=model.generate, kwargs=conf) | |||
| thread.start() | |||
| for _, new_text in enumerate(streamer): | |||
| yield new_text | |||
| except Exception as e: | |||
| yield "**ERROR**: " + str(e) | |||
| def Model(): | |||
| global models | |||
| random.seed(time.time()) | |||
| return random.choice(models) | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser() | |||
| parser.add_argument("--model_name", type=str, help="Model name") | |||
| parser.add_argument( | |||
| "--port", | |||
| default=7860, | |||
| type=int, | |||
| help="RPC serving port") | |||
| args = parser.parse_args() | |||
| handler = RPCHandler() | |||
| handler.register_function(chat) | |||
| handler.register_function(chat_streamly) | |||
| models = [] | |||
| for _ in range(1): | |||
| m = AutoModelForCausalLM.from_pretrained(args.model_name, | |||
| device_map="auto", | |||
| torch_dtype='auto') | |||
| models.append(m) | |||
| tokenizer = AutoTokenizer.from_pretrained(args.model_name) | |||
| # Run the server | |||
| rpc_server(handler, ('0.0.0.0', args.port), | |||
| authkey=b'infiniflow-token4kevinhu') | |||
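Note: this script serves `chat` (and `chat_streamly`) over a raw `multiprocessing.connection` socket rather than HTTP. A minimal client sketch that matches the wire format used by `RPCHandler.handle_connection` above — a pickled `(func_name, args, kwargs)` tuple per request, a pickled return value or exception per reply; host, port and prompt are placeholders:

```python
import pickle
from multiprocessing.connection import Client

# Connect with the same authkey the server passes to rpc_server().
conn = Client(("127.0.0.1", 7860), authkey=b"infiniflow-token4kevinhu")

# One request = a pickled (func_name, args, kwargs) tuple.
request = ("chat",
           ([{"role": "user", "content": "Hello"}],
            {"temperature": 0.1, "max_tokens": 128}),
           {})
conn.send(pickle.dumps(request))

# The server replies with a pickled return value, or a pickled exception object.
reply = pickle.loads(conn.recv())
if isinstance(reply, Exception):
    raise reply
print(reply)
conn.close()
```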
| @@ -1,89 +1,89 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from openai.lib.azure import AzureOpenAI | |||
| from zhipuai import ZhipuAI | |||
| import io | |||
| from abc import ABC | |||
| from ollama import Client | |||
| from openai import OpenAI | |||
| import os | |||
| import json | |||
| from rag.utils import num_tokens_from_string | |||
| class Base(ABC): | |||
| def __init__(self, key, model_name): | |||
| pass | |||
| def transcription(self, audio, **kwargs): | |||
| transcription = self.client.audio.transcriptions.create( | |||
| model=self.model_name, | |||
| file=audio, | |||
| response_format="text" | |||
| ) | |||
| return transcription.text.strip(), num_tokens_from_string(transcription.text.strip()) | |||
| class GPTSeq2txt(Base): | |||
| def __init__(self, key, model_name="whisper-1", base_url="https://api.openai.com/v1"): | |||
| if not base_url: base_url = "https://api.openai.com/v1" | |||
| self.client = OpenAI(api_key=key, base_url=base_url) | |||
| self.model_name = model_name | |||
| class QWenSeq2txt(Base): | |||
| def __init__(self, key, model_name="paraformer-realtime-8k-v1", **kwargs): | |||
| import dashscope | |||
| dashscope.api_key = key | |||
| self.model_name = model_name | |||
| def transcription(self, audio, format): | |||
| from http import HTTPStatus | |||
| from dashscope.audio.asr import Recognition | |||
| recognition = Recognition(model=self.model_name, | |||
| format=format, | |||
| sample_rate=16000, | |||
| callback=None) | |||
| result = recognition.call(audio) | |||
| ans = "" | |||
| if result.status_code == HTTPStatus.OK: | |||
| for sentence in result.get_sentence(): | |||
| ans += str(sentence) + '\n' | |||
| return ans, num_tokens_from_string(ans) | |||
| return "**ERROR**: " + result.message, 0 | |||
| class OllamaSeq2txt(Base): | |||
| def __init__(self, key, model_name, lang="Chinese", **kwargs): | |||
| self.client = Client(host=kwargs["base_url"]) | |||
| self.model_name = model_name | |||
| self.lang = lang | |||
| class AzureSeq2txt(Base): | |||
| def __init__(self, key, model_name, lang="Chinese", **kwargs): | |||
| self.client = AzureOpenAI(api_key=key, azure_endpoint=kwargs["base_url"], api_version="2024-02-01") | |||
| self.model_name = model_name | |||
| self.lang = lang | |||
| class XinferenceSeq2txt(Base): | |||
| def __init__(self, key, model_name="", base_url=""): | |||
| self.client = OpenAI(api_key="xxx", base_url=base_url) | |||
| self.model_name = model_name | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| from openai.lib.azure import AzureOpenAI | |||
| from zhipuai import ZhipuAI | |||
| import io | |||
| from abc import ABC | |||
| from ollama import Client | |||
| from openai import OpenAI | |||
| import os | |||
| import json | |||
| from rag.utils import num_tokens_from_string | |||
| class Base(ABC): | |||
| def __init__(self, key, model_name): | |||
| pass | |||
| def transcription(self, audio, **kwargs): | |||
| transcription = self.client.audio.transcriptions.create( | |||
| model=self.model_name, | |||
| file=audio, | |||
| response_format="text" | |||
| ) | |||
| return transcription.text.strip(), num_tokens_from_string(transcription.text.strip()) | |||
| class GPTSeq2txt(Base): | |||
| def __init__(self, key, model_name="whisper-1", base_url="https://api.openai.com/v1"): | |||
| if not base_url: base_url = "https://api.openai.com/v1" | |||
| self.client = OpenAI(api_key=key, base_url=base_url) | |||
| self.model_name = model_name | |||
| class QWenSeq2txt(Base): | |||
| def __init__(self, key, model_name="paraformer-realtime-8k-v1", **kwargs): | |||
| import dashscope | |||
| dashscope.api_key = key | |||
| self.model_name = model_name | |||
| def transcription(self, audio, format): | |||
| from http import HTTPStatus | |||
| from dashscope.audio.asr import Recognition | |||
| recognition = Recognition(model=self.model_name, | |||
| format=format, | |||
| sample_rate=16000, | |||
| callback=None) | |||
| result = recognition.call(audio) | |||
| ans = "" | |||
| if result.status_code == HTTPStatus.OK: | |||
| for sentence in result.get_sentence(): | |||
| ans += str(sentence) + '\n' | |||
| return ans, num_tokens_from_string(ans) | |||
| return "**ERROR**: " + result.message, 0 | |||
| class OllamaSeq2txt(Base): | |||
| def __init__(self, key, model_name, lang="Chinese", **kwargs): | |||
| self.client = Client(host=kwargs["base_url"]) | |||
| self.model_name = model_name | |||
| self.lang = lang | |||
| class AzureSeq2txt(Base): | |||
| def __init__(self, key, model_name, lang="Chinese", **kwargs): | |||
| self.client = AzureOpenAI(api_key=key, azure_endpoint=kwargs["base_url"], api_version="2024-02-01") | |||
| self.model_name = model_name | |||
| self.lang = lang | |||
| class XinferenceSeq2txt(Base): | |||
| def __init__(self, key, model_name="", base_url=""): | |||
| self.client = OpenAI(api_key="xxx", base_url=base_url) | |||
| self.model_name = model_name | |||
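For reference, a minimal usage sketch of the `GPTSeq2txt` wrapper defined above. The import path, API key and file name are assumptions for illustration, not values from this PR:

```python
# Hypothetical usage of the GPTSeq2txt class above.
from rag.llm.sequence2txt_model import GPTSeq2txt  # assumed module path

s2t = GPTSeq2txt(key="sk-...", model_name="whisper-1")

# Base.transcription() forwards the file object to the OpenAI audio API
# and returns a (text, token_count) pair.
with open("meeting.wav", "rb") as audio:
    text, used_tokens = s2t.transcription(audio)

print(used_tokens, text)
```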
| @@ -1,55 +1,55 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| from api.utils import get_base_config, decrypt_database_config | |||
| from api.utils.file_utils import get_project_base_directory | |||
| from api.utils.log_utils import LoggerFactory, getLogger | |||
| # Server | |||
| RAG_CONF_PATH = os.path.join(get_project_base_directory(), "conf") | |||
| SUBPROCESS_STD_LOG_NAME = "std.log" | |||
| ES = get_base_config("es", {}) | |||
| MINIO = decrypt_database_config(name="minio") | |||
| try: | |||
| REDIS = decrypt_database_config(name="redis") | |||
| except Exception: | |||
| REDIS = {} | |||
| DOC_MAXIMUM_SIZE = int(os.environ.get("MAX_CONTENT_LENGTH", 128 * 1024 * 1024)) | |||
| # Logger | |||
| LoggerFactory.set_directory( | |||
| os.path.join( | |||
| get_project_base_directory(), | |||
| "logs", | |||
| "rag")) | |||
| # {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0} | |||
| LoggerFactory.LEVEL = 30 | |||
| es_logger = getLogger("es") | |||
| minio_logger = getLogger("minio") | |||
| cron_logger = getLogger("cron_logger") | |||
| cron_logger.setLevel(20) | |||
| chunk_logger = getLogger("chunk_logger") | |||
| database_logger = getLogger("database") | |||
| SVR_QUEUE_NAME = "rag_flow_svr_queue" | |||
| SVR_QUEUE_RETENTION = 60*60 | |||
| SVR_QUEUE_MAX_LEN = 1024 | |||
| SVR_CONSUMER_NAME = "rag_flow_svr_consumer" | |||
| SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group" | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import os | |||
| from api.utils import get_base_config, decrypt_database_config | |||
| from api.utils.file_utils import get_project_base_directory | |||
| from api.utils.log_utils import LoggerFactory, getLogger | |||
| # Server | |||
| RAG_CONF_PATH = os.path.join(get_project_base_directory(), "conf") | |||
| SUBPROCESS_STD_LOG_NAME = "std.log" | |||
| ES = get_base_config("es", {}) | |||
| MINIO = decrypt_database_config(name="minio") | |||
| try: | |||
| REDIS = decrypt_database_config(name="redis") | |||
| except Exception: | |||
| REDIS = {} | |||
| DOC_MAXIMUM_SIZE = int(os.environ.get("MAX_CONTENT_LENGTH", 128 * 1024 * 1024)) | |||
| # Logger | |||
| LoggerFactory.set_directory( | |||
| os.path.join( | |||
| get_project_base_directory(), | |||
| "logs", | |||
| "rag")) | |||
| # {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0} | |||
| LoggerFactory.LEVEL = 30 | |||
| es_logger = getLogger("es") | |||
| minio_logger = getLogger("minio") | |||
| cron_logger = getLogger("cron_logger") | |||
| cron_logger.setLevel(20) | |||
| chunk_logger = getLogger("chunk_logger") | |||
| database_logger = getLogger("database") | |||
| SVR_QUEUE_NAME = "rag_flow_svr_queue" | |||
| SVR_QUEUE_RETENTION = 60*60 | |||
| SVR_QUEUE_MAX_LEN = 1024 | |||
| SVR_CONSUMER_NAME = "rag_flow_svr_consumer" | |||
| SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group" | |||
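Other modules are expected to import these shared values instead of re-reading the configuration themselves. A hypothetical consumer (the function name and log messages are illustrative only):

```python
from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME, cron_logger


def accept_upload(blob: bytes, name: str) -> bool:
    """Illustrative guard reusing the shared size limit and logger."""
    if len(blob) > DOC_MAXIMUM_SIZE:
        cron_logger.warning("{} exceeds MAX_CONTENT_LENGTH ({} bytes)".format(name, DOC_MAXIMUM_SIZE))
        return False
    cron_logger.info("{} accepted, tasks go to {}".format(name, SVR_QUEUE_NAME))
    return True
```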
| @@ -1,59 +1,59 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import random | |||
| import time | |||
| import traceback | |||
| from api.db.db_models import close_connection | |||
| from api.db.services.task_service import TaskService | |||
| from rag.settings import cron_logger | |||
| from rag.utils.minio_conn import MINIO | |||
| from rag.utils.redis_conn import REDIS_CONN | |||
| def collect(): | |||
| doc_locations = TaskService.get_ongoing_doc_name() | |||
| print(doc_locations) | |||
| if len(doc_locations) == 0: | |||
| time.sleep(1) | |||
| return | |||
| return doc_locations | |||
| def main(): | |||
| locations = collect() | |||
| if not locations:return | |||
| print("TASKS:", len(locations)) | |||
| for kb_id, loc in locations: | |||
| try: | |||
| if REDIS_CONN.is_alive(): | |||
| try: | |||
| key = "{}/{}".format(kb_id, loc) | |||
| if REDIS_CONN.exist(key):continue | |||
| file_bin = MINIO.get(kb_id, loc) | |||
| REDIS_CONN.transaction(key, file_bin, 12 * 60) | |||
| cron_logger.info("CACHE: {}".format(loc)) | |||
| except Exception as e: | |||
| traceback.print_exc() | |||
| except Exception as e: | |||
| traceback.print_exc() | |||
| if __name__ == "__main__": | |||
| while True: | |||
| main() | |||
| close_connection() | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import random | |||
| import time | |||
| import traceback | |||
| from api.db.db_models import close_connection | |||
| from api.db.services.task_service import TaskService | |||
| from rag.settings import cron_logger | |||
| from rag.utils.minio_conn import MINIO | |||
| from rag.utils.redis_conn import REDIS_CONN | |||
| def collect(): | |||
| doc_locations = TaskService.get_ongoing_doc_name() | |||
| print(doc_locations) | |||
| if len(doc_locations) == 0: | |||
| time.sleep(1) | |||
| return | |||
| return doc_locations | |||
| def main(): | |||
| locations = collect() | |||
| if not locations:return | |||
| print("TASKS:", len(locations)) | |||
| for kb_id, loc in locations: | |||
| try: | |||
| if REDIS_CONN.is_alive(): | |||
| try: | |||
| key = "{}/{}".format(kb_id, loc) | |||
| if REDIS_CONN.exist(key):continue | |||
| file_bin = MINIO.get(kb_id, loc) | |||
| REDIS_CONN.transaction(key, file_bin, 12 * 60) | |||
| cron_logger.info("CACHE: {}".format(loc)) | |||
| except Exception as e: | |||
| traceback.print_exc() | |||
| except Exception as e: | |||
| traceback.print_exc() | |||
| if __name__ == "__main__": | |||
| while True: | |||
| main() | |||
| close_connection() | |||
| time.sleep(1) | |||
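The loop above warms a Redis cache keyed as `"{kb_id}/{loc}"` with the raw blob pulled from MinIO. A hypothetical read path honouring the same key convention (this helper is not part of this PR):

```python
from rag.utils.minio_conn import MINIO
from rag.utils.redis_conn import REDIS_CONN


def fetch_file(kb_id, loc):
    """Return the document blob, preferring the cache warmed by cache_file_svr."""
    key = "{}/{}".format(kb_id, loc)
    cached = REDIS_CONN.get(key)
    if cached:
        return cached             # hit: served from the Redis cache
    return MINIO.get(kb_id, loc)  # miss: fall back to object storage
```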
| @@ -1,80 +1,80 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import discord | |||
| import requests | |||
| import base64 | |||
| import asyncio | |||
| URL = '{YOUR_IP_ADDRESS:PORT}/v1/api/completion_aibotk' # Default: https://demo.ragflow.io/v1/api/completion_aibotk | |||
| JSON_DATA = { | |||
| "conversation_id": "xxxxxxxxxxxxxxxxxxxxxxxxxxx", # Get conversation id from /api/new_conversation | |||
| "Authorization": "ragflow-xxxxxxxxxxxxxxxxxxxxxxxxxxxxx", # RAGFlow Assistant Chat Bot API Key | |||
| "word": "" # User question, don't need to initialize | |||
| } | |||
| DISCORD_BOT_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxx" #Get DISCORD_BOT_KEY from Discord Application | |||
| intents = discord.Intents.default() | |||
| intents.message_content = True | |||
| client = discord.Client(intents=intents) | |||
| @client.event | |||
| async def on_ready(): | |||
| print(f'We have logged in as {client.user}') | |||
| @client.event | |||
| async def on_message(message): | |||
| if message.author == client.user: | |||
| return | |||
| if client.user.mentioned_in(message): | |||
| if len(message.content.split('> ')) == 1: | |||
| await message.channel.send("Hi~ How can I help you? ") | |||
| else: | |||
| JSON_DATA['word']=message.content.split('> ')[1] | |||
| response = requests.post(URL, json=JSON_DATA) | |||
| response_data = response.json().get('data', []) | |||
| image_bool = False | |||
| for i in response_data: | |||
| if i['type'] == 1: | |||
| res = i['content'] | |||
| if i['type'] == 3: | |||
| image_bool = True | |||
| image_data = base64.b64decode(i['url']) | |||
| with open('tmp_image.png','wb') as file: | |||
| file.write(image_data) | |||
| image= discord.File('tmp_image.png') | |||
| await message.channel.send(f"{message.author.mention}{res}") | |||
| if image_bool: | |||
| await message.channel.send(file=image) | |||
| loop = asyncio.get_event_loop() | |||
| try: | |||
| loop.run_until_complete(client.start(DISCORD_BOT_KEY)) | |||
| except KeyboardInterrupt: | |||
| loop.run_until_complete(client.close()) | |||
| finally: | |||
| loop.close() | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import discord | |||
| import requests | |||
| import base64 | |||
| import asyncio | |||
| URL = '{YOUR_IP_ADDRESS:PORT}/v1/api/completion_aibotk' # Default: https://demo.ragflow.io/v1/api/completion_aibotk | |||
| JSON_DATA = { | |||
| "conversation_id": "xxxxxxxxxxxxxxxxxxxxxxxxxxx", # Get conversation id from /api/new_conversation | |||
| "Authorization": "ragflow-xxxxxxxxxxxxxxxxxxxxxxxxxxxxx", # RAGFlow Assistant Chat Bot API Key | |||
| "word": "" # User question, don't need to initialize | |||
| } | |||
| DISCORD_BOT_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxx" #Get DISCORD_BOT_KEY from Discord Application | |||
| intents = discord.Intents.default() | |||
| intents.message_content = True | |||
| client = discord.Client(intents=intents) | |||
| @client.event | |||
| async def on_ready(): | |||
| print(f'We have logged in as {client.user}') | |||
| @client.event | |||
| async def on_message(message): | |||
| if message.author == client.user: | |||
| return | |||
| if client.user.mentioned_in(message): | |||
| if len(message.content.split('> ')) == 1: | |||
| await message.channel.send("Hi~ How can I help you? ") | |||
| else: | |||
| JSON_DATA['word']=message.content.split('> ')[1] | |||
| response = requests.post(URL, json=JSON_DATA) | |||
| response_data = response.json().get('data', []) | |||
| image_bool = False | |||
| for i in response_data: | |||
| if i['type'] == 1: | |||
| res = i['content'] | |||
| if i['type'] == 3: | |||
| image_bool = True | |||
| image_data = base64.b64decode(i['url']) | |||
| with open('tmp_image.png','wb') as file: | |||
| file.write(image_data) | |||
| image= discord.File('tmp_image.png') | |||
| await message.channel.send(f"{message.author.mention}{res}") | |||
| if image_bool: | |||
| await message.channel.send(file=image) | |||
| loop = asyncio.get_event_loop() | |||
| try: | |||
| loop.run_until_complete(client.start(DISCORD_BOT_KEY)) | |||
| except KeyboardInterrupt: | |||
| loop.run_until_complete(client.close()) | |||
| finally: | |||
| loop.close() | |||
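Stripped of the Discord plumbing, the bot boils down to one POST against `completion_aibotk`. A standalone sketch of that call, mirroring the constants above (the URL, conversation id and API key are placeholders):

```python
import base64
import requests

payload = {
    "conversation_id": "xxxxxxxxxxxxxxxxxxxxxxxxxxx",          # from /api/new_conversation
    "Authorization": "ragflow-xxxxxxxxxxxxxxxxxxxxxxxxxxxxx",  # assistant API key
    "word": "What is RAGFlow?",
}
resp = requests.post("http://127.0.0.1/v1/api/completion_aibotk", json=payload)

for item in resp.json().get("data", []):
    if item["type"] == 1:      # type 1: plain-text answer
        print(item["content"])
    elif item["type"] == 3:    # type 3: base64-encoded image
        with open("answer.png", "wb") as f:
            f.write(base64.b64decode(item["url"]))
```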
| @@ -1,150 +1,150 @@ | |||
| import json | |||
| import redis | |||
| import logging | |||
| from rag import settings | |||
| from rag.utils import singleton | |||
| class Payload: | |||
| def __init__(self, consumer, queue_name, group_name, msg_id, message): | |||
| self.__consumer = consumer | |||
| self.__queue_name = queue_name | |||
| self.__group_name = group_name | |||
| self.__msg_id = msg_id | |||
| self.__message = json.loads(message['message']) | |||
| def ack(self): | |||
| try: | |||
| self.__consumer.xack(self.__queue_name, self.__group_name, self.__msg_id) | |||
| return True | |||
| except Exception as e: | |||
| logging.warning("[EXCEPTION]ack" + str(self.__queue_name) + "||" + str(e)) | |||
| return False | |||
| def get_message(self): | |||
| return self.__message | |||
| @singleton | |||
| class RedisDB: | |||
| def __init__(self): | |||
| self.REDIS = None | |||
| self.config = settings.REDIS | |||
| self.__open__() | |||
| def __open__(self): | |||
| try: | |||
| self.REDIS = redis.StrictRedis(host=self.config["host"].split(":")[0], | |||
| port=int(self.config.get("host", ":6379").split(":")[1]), | |||
| db=int(self.config.get("db", 1)), | |||
| password=self.config.get("password"), | |||
| decode_responses=True) | |||
| except Exception as e: | |||
| logging.warning("Redis can't be connected.") | |||
| return self.REDIS | |||
| def health(self): | |||
| self.REDIS.ping() | |||
| a, b = 'xx', 'yy' | |||
| self.REDIS.set(a, b, 3) | |||
| if self.REDIS.get(a) == b: | |||
| return True | |||
| def is_alive(self): | |||
| return self.REDIS is not None | |||
| def exist(self, k): | |||
| if not self.REDIS: return | |||
| try: | |||
| return self.REDIS.exists(k) | |||
| except Exception as e: | |||
| logging.warning("[EXCEPTION]exist" + str(k) + "||" + str(e)) | |||
| self.__open__() | |||
| def get(self, k): | |||
| if not self.REDIS: return | |||
| try: | |||
| return self.REDIS.get(k) | |||
| except Exception as e: | |||
| logging.warning("[EXCEPTION]get" + str(k) + "||" + str(e)) | |||
| self.__open__() | |||
| def set_obj(self, k, obj, exp=3600): | |||
| try: | |||
| self.REDIS.set(k, json.dumps(obj, ensure_ascii=False), exp) | |||
| return True | |||
| except Exception as e: | |||
| logging.warning("[EXCEPTION]set_obj" + str(k) + "||" + str(e)) | |||
| self.__open__() | |||
| return False | |||
| def set(self, k, v, exp=3600): | |||
| try: | |||
| self.REDIS.set(k, v, exp) | |||
| return True | |||
| except Exception as e: | |||
| logging.warning("[EXCEPTION]set" + str(k) + "||" + str(e)) | |||
| self.__open__() | |||
| return False | |||
| def transaction(self, key, value, exp=3600): | |||
| try: | |||
| pipeline = self.REDIS.pipeline(transaction=True) | |||
| pipeline.set(key, value, exp, nx=True) | |||
| pipeline.execute() | |||
| return True | |||
| except Exception as e: | |||
| logging.warning("[EXCEPTION]set" + str(key) + "||" + str(e)) | |||
| self.__open__() | |||
| return False | |||
| def queue_product(self, queue, message, exp=settings.SVR_QUEUE_RETENTION) -> bool: | |||
| for _ in range(3): | |||
| try: | |||
| payload = {"message": json.dumps(message)} | |||
| pipeline = self.REDIS.pipeline() | |||
| pipeline.xadd(queue, payload) | |||
| pipeline.expire(queue, exp) | |||
| pipeline.execute() | |||
| return True | |||
| except Exception as e: | |||
| print(e) | |||
| logging.warning("[EXCEPTION]producer" + str(queue) + "||" + str(e)) | |||
| return False | |||
| def queue_consumer(self, queue_name, group_name, consumer_name, msg_id=b">") -> Payload: | |||
| try: | |||
| group_info = self.REDIS.xinfo_groups(queue_name) | |||
| if not any(e["name"] == group_name for e in group_info): | |||
| self.REDIS.xgroup_create( | |||
| queue_name, | |||
| group_name, | |||
| id="0", | |||
| mkstream=True | |||
| ) | |||
| args = { | |||
| "groupname": group_name, | |||
| "consumername": consumer_name, | |||
| "count": 1, | |||
| "block": 10000, | |||
| "streams": {queue_name: msg_id}, | |||
| } | |||
| messages = self.REDIS.xreadgroup(**args) | |||
| if not messages: | |||
| return None | |||
| stream, element_list = messages[0] | |||
| msg_id, payload = element_list[0] | |||
| res = Payload(self.REDIS, queue_name, group_name, msg_id, payload) | |||
| return res | |||
| except Exception as e: | |||
| if 'key' in str(e): | |||
| pass | |||
| else: | |||
| logging.warning("[EXCEPTION]consumer" + str(queue_name) + "||" + str(e)) | |||
| return None | |||
| REDIS_CONN = RedisDB() | |||
| import json | |||
| import redis | |||
| import logging | |||
| from rag import settings | |||
| from rag.utils import singleton | |||
| class Payload: | |||
| def __init__(self, consumer, queue_name, group_name, msg_id, message): | |||
| self.__consumer = consumer | |||
| self.__queue_name = queue_name | |||
| self.__group_name = group_name | |||
| self.__msg_id = msg_id | |||
| self.__message = json.loads(message['message']) | |||
| def ack(self): | |||
| try: | |||
| self.__consumer.xack(self.__queue_name, self.__group_name, self.__msg_id) | |||
| return True | |||
| except Exception as e: | |||
| logging.warning("[EXCEPTION]ack" + str(self.__queue_name) + "||" + str(e)) | |||
| return False | |||
| def get_message(self): | |||
| return self.__message | |||
| @singleton | |||
| class RedisDB: | |||
| def __init__(self): | |||
| self.REDIS = None | |||
| self.config = settings.REDIS | |||
| self.__open__() | |||
| def __open__(self): | |||
| try: | |||
| self.REDIS = redis.StrictRedis(host=self.config["host"].split(":")[0], | |||
| port=int(self.config.get("host", ":6379").split(":")[1]), | |||
| db=int(self.config.get("db", 1)), | |||
| password=self.config.get("password"), | |||
| decode_responses=True) | |||
| except Exception as e: | |||
| logging.warning("Redis can't be connected.") | |||
| return self.REDIS | |||
| def health(self): | |||
| self.REDIS.ping() | |||
| a, b = 'xx', 'yy' | |||
| self.REDIS.set(a, b, 3) | |||
| if self.REDIS.get(a) == b: | |||
| return True | |||
| def is_alive(self): | |||
| return self.REDIS is not None | |||
| def exist(self, k): | |||
| if not self.REDIS: return | |||
| try: | |||
| return self.REDIS.exists(k) | |||
| except Exception as e: | |||
| logging.warning("[EXCEPTION]exist" + str(k) + "||" + str(e)) | |||
| self.__open__() | |||
| def get(self, k): | |||
| if not self.REDIS: return | |||
| try: | |||
| return self.REDIS.get(k) | |||
| except Exception as e: | |||
| logging.warning("[EXCEPTION]get" + str(k) + "||" + str(e)) | |||
| self.__open__() | |||
| def set_obj(self, k, obj, exp=3600): | |||
| try: | |||
| self.REDIS.set(k, json.dumps(obj, ensure_ascii=False), exp) | |||
| return True | |||
| except Exception as e: | |||
| logging.warning("[EXCEPTION]set_obj" + str(k) + "||" + str(e)) | |||
| self.__open__() | |||
| return False | |||
| def set(self, k, v, exp=3600): | |||
| try: | |||
| self.REDIS.set(k, v, exp) | |||
| return True | |||
| except Exception as e: | |||
| logging.warning("[EXCEPTION]set" + str(k) + "||" + str(e)) | |||
| self.__open__() | |||
| return False | |||
| def transaction(self, key, value, exp=3600): | |||
| try: | |||
| pipeline = self.REDIS.pipeline(transaction=True) | |||
| pipeline.set(key, value, exp, nx=True) | |||
| pipeline.execute() | |||
| return True | |||
| except Exception as e: | |||
| logging.warning("[EXCEPTION]set" + str(key) + "||" + str(e)) | |||
| self.__open__() | |||
| return False | |||
| def queue_product(self, queue, message, exp=settings.SVR_QUEUE_RETENTION) -> bool: | |||
| for _ in range(3): | |||
| try: | |||
| payload = {"message": json.dumps(message)} | |||
| pipeline = self.REDIS.pipeline() | |||
| pipeline.xadd(queue, payload) | |||
| pipeline.expire(queue, exp) | |||
| pipeline.execute() | |||
| return True | |||
| except Exception as e: | |||
| print(e) | |||
| logging.warning("[EXCEPTION]producer" + str(queue) + "||" + str(e)) | |||
| return False | |||
| def queue_consumer(self, queue_name, group_name, consumer_name, msg_id=b">") -> Payload: | |||
| try: | |||
| group_info = self.REDIS.xinfo_groups(queue_name) | |||
| if not any(e["name"] == group_name for e in group_info): | |||
| self.REDIS.xgroup_create( | |||
| queue_name, | |||
| group_name, | |||
| id="0", | |||
| mkstream=True | |||
| ) | |||
| args = { | |||
| "groupname": group_name, | |||
| "consumername": consumer_name, | |||
| "count": 1, | |||
| "block": 10000, | |||
| "streams": {queue_name: msg_id}, | |||
| } | |||
| messages = self.REDIS.xreadgroup(**args) | |||
| if not messages: | |||
| return None | |||
| stream, element_list = messages[0] | |||
| msg_id, payload = element_list[0] | |||
| res = Payload(self.REDIS, queue_name, group_name, msg_id, payload) | |||
| return res | |||
| except Exception as e: | |||
| if 'key' in str(e): | |||
| pass | |||
| else: | |||
| logging.warning("[EXCEPTION]consumer" + str(queue_name) + "||" + str(e)) | |||
| return None | |||
| REDIS_CONN = RedisDB() | |||
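Taken together with the queue constants from `rag/settings.py` above, the stream helpers give a small produce/consume round trip. A sketch (the message body is made up for illustration):

```python
from rag import settings
from rag.utils.redis_conn import REDIS_CONN

# Producer: XADD the task onto the server queue.
REDIS_CONN.queue_product(settings.SVR_QUEUE_NAME, {"task_id": "demo-1"})

# Consumer: read one pending entry for this consumer group (blocks up to 10 s).
payload = REDIS_CONN.queue_consumer(settings.SVR_QUEUE_NAME,
                                    settings.SVR_CONSUMER_GROUP_NAME,
                                    settings.SVR_CONSUMER_NAME)
if payload:
    print(payload.get_message())   # -> {"task_id": "demo-1"}
    payload.ack()                  # XACK so it is not redelivered
```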
| @@ -1,9 +1,9 @@ | |||
| /node_modules | |||
| /.env.local | |||
| /.umirc.local.ts | |||
| /config/config.local.ts | |||
| /src/.umi/* | |||
| /src/.umi-production/* | |||
| /src/.umi-test | |||
| /dist | |||
| .swc | |||
| /node_modules | |||
| /.env.local | |||
| /.umirc.local.ts | |||
| /config/config.local.ts | |||
| /src/.umi/* | |||
| /src/.umi-production/* | |||
| /src/.umi-test | |||
| /dist | |||
| .swc | |||
| @@ -1,2 +1,2 @@ | |||
| registry=https://registry.npmmirror.com/ | |||
| registry=https://registry.npmmirror.com/ | |||
| @@ -1,27 +1,26 @@ | |||
| import React, { useReducer } from 'react' | |||
| const CHANGE_LOCALE = 'CHANGE_LOCALE' | |||
| const mainContext = React.createContext() | |||
| const reducer = (state, action) => { | |||
| switch (action.type) { | |||
| case CHANGE_LOCALE: | |||
| return { ...state, locale: action.locale || 'zh' } | |||
| default: | |||
| return state | |||
| } | |||
| } | |||
| const ContextProvider = (props) => { | |||
| const [state, dispatch] = useReducer(reducer, { | |||
| locale: 'zh' | |||
| }) | |||
| return ( | |||
| <mainContext.Provider value={{ state, dispatch }}> | |||
| {props.children} | |||
| </mainContext.Provider> | |||
| ) | |||
| } | |||
| export { reducer, mainContext, ContextProvider } | |||
| import React, { useReducer } from 'react'; | |||
| const CHANGE_LOCALE = 'CHANGE_LOCALE'; | |||
| const mainContext = React.createContext(); | |||
| const reducer = (state, action) => { | |||
| switch (action.type) { | |||
| case CHANGE_LOCALE: | |||
| return { ...state, locale: action.locale || 'zh' }; | |||
| default: | |||
| return state; | |||
| } | |||
| }; | |||
| const ContextProvider = (props) => { | |||
| const [state, dispatch] = useReducer(reducer, { | |||
| locale: 'zh', | |||
| }); | |||
| return ( | |||
| <mainContext.Provider value={{ state, dispatch }}> | |||
| {props.children} | |||
| </mainContext.Provider> | |||
| ); | |||
| }; | |||
| export { ContextProvider, mainContext, reducer }; | |||
| @@ -1,114 +1,114 @@ | |||
| <?xml version="1.0" encoding="utf-8"?> | |||
| <!-- Generator: Adobe Illustrator 28.2.0, SVG Export Plug-In . SVG Version: 6.00 Build 0) --> | |||
| <svg version="1.0" id="katman_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" | |||
| viewBox="0 0 1589 1092" style="enable-background:new 0 0 1589 1092;" xml:space="preserve"> | |||
| <style type="text/css"> | |||
| .st0{fill:#8779CD;} | |||
| .st1{fill:#4991E7;} | |||
| .st2{fill:#8A78CB;} | |||
| .st3{fill:url(#SVGID_1_);} | |||
| .st4{fill:#4D8BEB;} | |||
| .st5{fill:#7F7BD1;} | |||
| .st6{fill:url(#SVGID_00000002374047799971512340000007471530466755245738_);} | |||
| .st7{fill:url(#SVGID_00000157275849013902826440000016458342546856776875_);} | |||
| </style> | |||
| <g> | |||
| <path class="st0" d="M1124.57,373.55c0.31,0,0.32-0.07,0.05-0.2c-0.17-0.08-0.34-0.12-0.52-0.12 | |||
| c-23.87-0.46-44.64-8.81-62.33-25.04c-18.73-17.18-30.31-42.27-29.75-68.18c0-0.04-0.02-0.06-0.06-0.06 | |||
| c-0.15-0.01-0.17-0.01-0.04,0c0.04,0.01,0.06,0.03,0.05,0.07c-0.56,6.79-1.25,12.28-2.08,16.45c-5.2,26.17-18.72,46.59-40.55,61.26 | |||
| c-15.05,10.11-31.88,15.26-50.49,15.47c-0.11,0-0.16,0.05-0.16,0.16v0.01c0,0.11,0.06,0.17,0.17,0.17 | |||
| c27.03-0.05,54.53,13.25,71.42,34.26c9.03,11.23,15.25,23.74,18.65,37.52c0.81,3.27,1.43,6.72,1.86,10.34 | |||
| c0.45,3.77,0.81,7.31,1.07,10.64c0.07,0.83,0.11,0.83,0.13-0.01c0.12-5.43,0.28-10.69,1.22-15.96 | |||
| c6.06-33.79,29.86-60.29,61.88-71.75C1104.58,375.18,1114.41,373.5,1124.57,373.55z"/> | |||
| </g> | |||
| <g> | |||
| <path class="st1" d="M468.99,570.58H323.75h0c-0.29,0-0.53,0.24-0.53,0.53l0.01,33.95c0,0.43,0.35,0.77,0.78,0.77h108.5 | |||
| c0.47,0,0.68,0.23,0.65,0.69c-1.24,15.39-4.56,28.52-9.97,39.41c-13.71,27.61-36.17,45.26-67.38,52.94 | |||
| c-12.12,2.98-24.87,4.19-38.26,3.62c-23.38-0.99-44.83-8.27-64.36-21.86c-27.04-18.83-44.26-49.58-48.13-82.08 | |||
| c-1.91-16-1.38-31.61,1.59-46.82c4.5-23.09,16.19-44.7,33.49-61.05c19.55-18.48,43.26-29.07,71.13-31.76 | |||
| c34.53-3.33,72.86,8.95,95.88,35.39c0.27,0.31,0.54,0.31,0.83,0.02l25.75-26.48c0.29-0.3,0.28-0.58-0.05-0.84 | |||
| c-1.89-1.49-3.22-3.46-4.97-5.13c-8.05-7.73-16.45-14.07-25.19-19.02c-27.14-15.33-58.47-22.05-89.79-20.37 | |||
| c-26.99,1.44-51.79,9.13-74.41,23.07c-25.29,15.59-44.66,36.97-58.1,64.14c-13.12,26.53-17.74,56.08-15.28,85.68 | |||
| c2.32,27.87,11.53,53.36,27.62,76.45c26.8,38.46,68.51,62.31,115.38,65.98c48.55,3.81,97.2-11.31,129.15-49.08 | |||
| c15.45-18.27,25.56-39.58,30.35-63.93c1.26-6.41,2.15-13.18,2.67-20.31c0.84-11.31,0.24-22.53-1.81-33.65 | |||
| C469.27,570.69,469.14,570.58,468.99,570.58z"/> | |||
| </g> | |||
| <g> | |||
| <circle class="st2" cx="1108.46" cy="451.38" r="26.99"/> | |||
| </g> | |||
| <g> | |||
| <linearGradient id="SVGID_1_" gradientUnits="userSpaceOnUse" x1="1373.5259" y1="451.3777" x2="1427.4858" y2="451.3777"> | |||
| <stop offset="0" style="stop-color:#439DDF"/> | |||
| <stop offset="0" style="stop-color:#4F87ED"/> | |||
| <stop offset="0" style="stop-color:#9476C5"/> | |||
| <stop offset="0" style="stop-color:#BC688E"/> | |||
| <stop offset="1" style="stop-color:#D6645D"/> | |||
| </linearGradient> | |||
| <circle class="st3" cx="1400.51" cy="451.38" r="26.98"/> | |||
| </g> | |||
| <g> | |||
| <g> | |||
| <path class="st4" d="M614.94,510.07c-27.34-3.09-53.3,2.03-75.45,18.67c-27.64,20.76-42.19,52.35-44.27,86.89 | |||
| c-0.61,10.28-0.17,20.38,1.33,30.3c3.69,24.45,13.67,44.97,29.94,61.57c25.12,25.64,60.04,34.54,95.3,29.6 | |||
| c11.11-1.56,20.53-4.19,28.26-7.89c21.5-10.29,37.89-26.02,49.17-47.19c0.01-0.01,0.01-0.02,0.01-0.03 | |||
| c0.11-0.22,0.01-0.48-0.21-0.59l-31.42-14.87c-0.03-0.01-0.06-0.03-0.1-0.04c-0.42-0.14-0.87,0.09-1.01,0.52 | |||
| c-0.12,0.37-0.28,0.72-0.47,1.06c-9.29,15.92-25.76,30.49-44.18,34.45c-9.83,2.11-19.13,2.43-27.88,0.97 | |||
| c-30.48-5.08-53.56-27.7-59.25-58.04c-0.88-4.72-1.45-9.12-1.7-13.2c-0.03-0.45,0.19-0.67,0.64-0.67H702.1 | |||
| c0.45,0,0.7-0.23,0.74-0.68c2.69-28.85-3.42-58.64-20.13-82.12C666.94,526.62,642.21,513.15,614.94,510.07z M663,600.58H535.82 | |||
| c-0.43,0-0.6-0.21-0.51-0.64c2.95-13.33,8.25-24.64,15.9-33.91c10.88-13.18,26.74-21.54,43.93-22.57 | |||
| c3.74-0.22,7.72-0.21,11.93,0.04c16.35,0.95,32.82,8.76,43.04,21.59c7.82,9.8,12.29,21.44,13.42,34.91 | |||
| C663.56,600.38,663.38,600.58,663,600.58z"/> | |||
| </g> | |||
| </g> | |||
| <g> | |||
| <path class="st5" d="M1054.69,576.29c-1.93-16.86-8.45-33.49-19.59-46.27c-9.62-11.03-23.29-17.2-37.81-19.46 | |||
| c-11.6-1.81-23.18-1.75-34.74,0.18c-7.59,1.33-14.15,3.4-19.66,6.2c-15.08,7.65-27.25,18.71-36.5,33.2 | |||
| c-0.37,0.57-0.66,0.54-0.87-0.1c-0.63-1.85-1.42-3.65-2.38-5.41c-8.86-16.26-25.41-28.81-43.44-33.15 | |||
| c-13.41-3.23-26.6-2.98-39.55,0.73c-17.31,5.21-31.41,14.86-42.31,28.93c-1.33,1.71-2.6,3.7-3.8,5.96 | |||
| c-0.16,0.29-0.41,0.45-0.74,0.48l-0.5,0.04c-0.38,0.03-0.57-0.14-0.57-0.52l0.02-30.78c0,0,0-0.01,0-0.01 | |||
| c0-0.13-0.1-0.23-0.23-0.23h-35.5c-0.42,0-0.76,0.34-0.76,0.76l0.01,214.35c0,0.25,0.12,0.38,0.37,0.38l37.37,0.01 | |||
| c0.37,0,0.55-0.18,0.56-0.55c0.03-37.07,0-75.86-0.09-116.39c-0.02-6.81,0.32-12.29,1.01-16.44c4.42-26.52,23.44-53.23,52.48-54.48 | |||
| c24.5-0.56,42.87,10.8,47.47,35.65c1.19,6.43,1.79,12.91,1.8,19.46c0.06,42.99,0.08,87.05,0.05,132.2c0,0.36,0.18,0.54,0.53,0.54 | |||
| l36.76,0.01c0.23,0,0.42-0.19,0.42-0.42c0.09-37.85,0.07-75.53-0.04-113.04c-0.03-8.1,0.3-14.47,0.98-19.11 | |||
| c1.67-11.49,5.87-22.17,12.59-32.03c3.2-4.71,7.28-9.01,12.24-12.91c9.58-7.53,20.51-10.95,32.79-10.28 | |||
| c13.51,0.18,26.8,5.06,35.04,15.92c7.31,9.65,9.7,24.58,9.73,36.42c0.1,41.75,0.11,86.68,0.04,134.79c0,0.43,0.21,0.65,0.64,0.65 | |||
| l36.52,0.01c0.4,0,0.6-0.2,0.6-0.6c-0.11-43.76-0.11-88.64,0.02-134.65C1055.67,588.72,1055.35,582.04,1054.69,576.29z"/> | |||
| </g> | |||
| <g> | |||
| <linearGradient id="SVGID_00000006692382290725070250000008342888873359191228_" gradientUnits="userSpaceOnUse" x1="1162.6759" y1="620.3867" x2="1350.1307" y2="620.3867"> | |||
| <stop offset="0" style="stop-color:#439DDF"/> | |||
| <stop offset="0" style="stop-color:#4F87ED"/> | |||
| <stop offset="0" style="stop-color:#9177C7"/> | |||
| <stop offset="0.7815" style="stop-color:#9476C5"/> | |||
| <stop offset="0.8883" style="stop-color:#BC688E"/> | |||
| <stop offset="1" style="stop-color:#D6645D"/> | |||
| </linearGradient> | |||
| <path style="fill:url(#SVGID_00000006692382290725070250000008342888873359191228_);" d="M1341,549.28 | |||
| c-10.36-21.4-28.17-34.24-51.19-38.36c-26.08-4.67-51.48-0.1-72.37,16.89c-6.89,5.26-12.22,11.18-15.98,17.77 | |||
| c-1.85,3.25-2.78,3.01-2.78-0.73l0.01-28.19c0-0.39-0.19-0.58-0.58-0.58h-35.08c-0.19,0-0.35,0.16-0.35,0.36V730.9 | |||
| c0,0.45,0.22,0.68,0.67,0.68l37.12-0.01c0.34,0,0.51-0.17,0.51-0.51c0.07-38.4,0.06-77.08-0.03-116.03 | |||
| c-0.02-7.34,0.45-13.46,1.4-18.35c3.01-15.38,10.38-28.53,22.11-39.45c1.76-1.37,3.5-2.7,5.22-3.97 | |||
| c11.17-8.28,23.33-10.43,36.92-9.26c16.58,1.43,33.15,9.83,39.82,25.25c3.53,8.16,5.3,17.25,5.32,27.28 | |||
| c0.07,43.53,0.09,88.45,0.04,134.74c0,0.21,0.1,0.31,0.3,0.31h37.45c0.37,0,0.56-0.19,0.56-0.57c0.07-46.67,0.06-93.28-0.05-139.83 | |||
| C1350,575.62,1346.98,561.65,1341,549.28z M1217.67,529.37c0.01,0,0.03,0,0.04,0c0.02,0.01,0.04,0.02,0.06,0.02 | |||
| C1217.73,529.38,1217.7,529.37,1217.67,529.37z"/> | |||
| </g> | |||
| <g> | |||
| <path class="st2" d="M1127.23,516.08h-37.32c-0.15,0-0.27,0.12-0.27,0.27v214.96c0,0.15,0.12,0.27,0.27,0.27h37.32 | |||
| c0.15,0,0.27-0.12,0.27-0.27V516.35C1127.5,516.2,1127.38,516.08,1127.23,516.08z"/> | |||
| </g> | |||
| <g> | |||
| <linearGradient id="SVGID_00000158714738904643395990000007397907749964941716_" gradientUnits="userSpaceOnUse" x1="1381.8658" y1="623.8276" x2="1419.6459" y2="623.8276"> | |||
| <stop offset="0" style="stop-color:#439DDF"/> | |||
| <stop offset="0" style="stop-color:#4F87ED"/> | |||
| <stop offset="0" style="stop-color:#9476C5"/> | |||
| <stop offset="0" style="stop-color:#BC688E"/> | |||
| <stop offset="1" style="stop-color:#D6645D"/> | |||
| </linearGradient> | |||
| <path style="fill:url(#SVGID_00000158714738904643395990000007397907749964941716_);" d="M1419.38,516.08h-37.24 | |||
| c-0.15,0-0.27,0.12-0.27,0.27v214.96c0,0.15,0.12,0.27,0.27,0.27h37.24c0.15,0,0.27-0.12,0.27-0.27V516.35 | |||
| C1419.65,516.2,1419.53,516.08,1419.38,516.08z"/> | |||
| </g> | |||
| </svg> | |||
| <?xml version="1.0" encoding="utf-8"?> | |||
| <!-- Generator: Adobe Illustrator 28.2.0, SVG Export Plug-In . SVG Version: 6.00 Build 0) --> | |||
| <svg version="1.0" id="katman_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" | |||
| viewBox="0 0 1589 1092" style="enable-background:new 0 0 1589 1092;" xml:space="preserve"> | |||
| <style type="text/css"> | |||
| .st0{fill:#8779CD;} | |||
| .st1{fill:#4991E7;} | |||
| .st2{fill:#8A78CB;} | |||
| .st3{fill:url(#SVGID_1_);} | |||
| .st4{fill:#4D8BEB;} | |||
| .st5{fill:#7F7BD1;} | |||
| .st6{fill:url(#SVGID_00000002374047799971512340000007471530466755245738_);} | |||
| .st7{fill:url(#SVGID_00000157275849013902826440000016458342546856776875_);} | |||
| </style> | |||
| <g> | |||
| <path class="st0" d="M1124.57,373.55c0.31,0,0.32-0.07,0.05-0.2c-0.17-0.08-0.34-0.12-0.52-0.12 | |||
| c-23.87-0.46-44.64-8.81-62.33-25.04c-18.73-17.18-30.31-42.27-29.75-68.18c0-0.04-0.02-0.06-0.06-0.06 | |||
| c-0.15-0.01-0.17-0.01-0.04,0c0.04,0.01,0.06,0.03,0.05,0.07c-0.56,6.79-1.25,12.28-2.08,16.45c-5.2,26.17-18.72,46.59-40.55,61.26 | |||
| c-15.05,10.11-31.88,15.26-50.49,15.47c-0.11,0-0.16,0.05-0.16,0.16v0.01c0,0.11,0.06,0.17,0.17,0.17 | |||
| c27.03-0.05,54.53,13.25,71.42,34.26c9.03,11.23,15.25,23.74,18.65,37.52c0.81,3.27,1.43,6.72,1.86,10.34 | |||
| c0.45,3.77,0.81,7.31,1.07,10.64c0.07,0.83,0.11,0.83,0.13-0.01c0.12-5.43,0.28-10.69,1.22-15.96 | |||
| c6.06-33.79,29.86-60.29,61.88-71.75C1104.58,375.18,1114.41,373.5,1124.57,373.55z"/> | |||
| </g> | |||
| <g> | |||
| <path class="st1" d="M468.99,570.58H323.75h0c-0.29,0-0.53,0.24-0.53,0.53l0.01,33.95c0,0.43,0.35,0.77,0.78,0.77h108.5 | |||
| c0.47,0,0.68,0.23,0.65,0.69c-1.24,15.39-4.56,28.52-9.97,39.41c-13.71,27.61-36.17,45.26-67.38,52.94 | |||
| c-12.12,2.98-24.87,4.19-38.26,3.62c-23.38-0.99-44.83-8.27-64.36-21.86c-27.04-18.83-44.26-49.58-48.13-82.08 | |||
| c-1.91-16-1.38-31.61,1.59-46.82c4.5-23.09,16.19-44.7,33.49-61.05c19.55-18.48,43.26-29.07,71.13-31.76 | |||
| c34.53-3.33,72.86,8.95,95.88,35.39c0.27,0.31,0.54,0.31,0.83,0.02l25.75-26.48c0.29-0.3,0.28-0.58-0.05-0.84 | |||
| c-1.89-1.49-3.22-3.46-4.97-5.13c-8.05-7.73-16.45-14.07-25.19-19.02c-27.14-15.33-58.47-22.05-89.79-20.37 | |||
| c-26.99,1.44-51.79,9.13-74.41,23.07c-25.29,15.59-44.66,36.97-58.1,64.14c-13.12,26.53-17.74,56.08-15.28,85.68 | |||
| c2.32,27.87,11.53,53.36,27.62,76.45c26.8,38.46,68.51,62.31,115.38,65.98c48.55,3.81,97.2-11.31,129.15-49.08 | |||
| c15.45-18.27,25.56-39.58,30.35-63.93c1.26-6.41,2.15-13.18,2.67-20.31c0.84-11.31,0.24-22.53-1.81-33.65 | |||
| C469.27,570.69,469.14,570.58,468.99,570.58z"/> | |||
| </g> | |||
| <g> | |||
| <circle class="st2" cx="1108.46" cy="451.38" r="26.99"/> | |||
| </g> | |||
| <g> | |||
| <linearGradient id="SVGID_1_" gradientUnits="userSpaceOnUse" x1="1373.5259" y1="451.3777" x2="1427.4858" y2="451.3777"> | |||
| <stop offset="0" style="stop-color:#439DDF"/> | |||
| <stop offset="0" style="stop-color:#4F87ED"/> | |||
| <stop offset="0" style="stop-color:#9476C5"/> | |||
| <stop offset="0" style="stop-color:#BC688E"/> | |||
| <stop offset="1" style="stop-color:#D6645D"/> | |||
| </linearGradient> | |||
| <circle class="st3" cx="1400.51" cy="451.38" r="26.98"/> | |||
| </g> | |||
| <g> | |||
| <g> | |||
| <path class="st4" d="M614.94,510.07c-27.34-3.09-53.3,2.03-75.45,18.67c-27.64,20.76-42.19,52.35-44.27,86.89 | |||
| c-0.61,10.28-0.17,20.38,1.33,30.3c3.69,24.45,13.67,44.97,29.94,61.57c25.12,25.64,60.04,34.54,95.3,29.6 | |||
| c11.11-1.56,20.53-4.19,28.26-7.89c21.5-10.29,37.89-26.02,49.17-47.19c0.01-0.01,0.01-0.02,0.01-0.03 | |||
| c0.11-0.22,0.01-0.48-0.21-0.59l-31.42-14.87c-0.03-0.01-0.06-0.03-0.1-0.04c-0.42-0.14-0.87,0.09-1.01,0.52 | |||
| c-0.12,0.37-0.28,0.72-0.47,1.06c-9.29,15.92-25.76,30.49-44.18,34.45c-9.83,2.11-19.13,2.43-27.88,0.97 | |||
| c-30.48-5.08-53.56-27.7-59.25-58.04c-0.88-4.72-1.45-9.12-1.7-13.2c-0.03-0.45,0.19-0.67,0.64-0.67H702.1 | |||
| c0.45,0,0.7-0.23,0.74-0.68c2.69-28.85-3.42-58.64-20.13-82.12C666.94,526.62,642.21,513.15,614.94,510.07z M663,600.58H535.82 | |||
| c-0.43,0-0.6-0.21-0.51-0.64c2.95-13.33,8.25-24.64,15.9-33.91c10.88-13.18,26.74-21.54,43.93-22.57 | |||
| c3.74-0.22,7.72-0.21,11.93,0.04c16.35,0.95,32.82,8.76,43.04,21.59c7.82,9.8,12.29,21.44,13.42,34.91 | |||
| C663.56,600.38,663.38,600.58,663,600.58z"/> | |||
| </g> | |||
| </g> | |||
| <g> | |||
| <path class="st5" d="M1054.69,576.29c-1.93-16.86-8.45-33.49-19.59-46.27c-9.62-11.03-23.29-17.2-37.81-19.46 | |||
| c-11.6-1.81-23.18-1.75-34.74,0.18c-7.59,1.33-14.15,3.4-19.66,6.2c-15.08,7.65-27.25,18.71-36.5,33.2 | |||
| c-0.37,0.57-0.66,0.54-0.87-0.1c-0.63-1.85-1.42-3.65-2.38-5.41c-8.86-16.26-25.41-28.81-43.44-33.15 | |||
| c-13.41-3.23-26.6-2.98-39.55,0.73c-17.31,5.21-31.41,14.86-42.31,28.93c-1.33,1.71-2.6,3.7-3.8,5.96 | |||
| c-0.16,0.29-0.41,0.45-0.74,0.48l-0.5,0.04c-0.38,0.03-0.57-0.14-0.57-0.52l0.02-30.78c0,0,0-0.01,0-0.01 | |||
| c0-0.13-0.1-0.23-0.23-0.23h-35.5c-0.42,0-0.76,0.34-0.76,0.76l0.01,214.35c0,0.25,0.12,0.38,0.37,0.38l37.37,0.01 | |||
| c0.37,0,0.55-0.18,0.56-0.55c0.03-37.07,0-75.86-0.09-116.39c-0.02-6.81,0.32-12.29,1.01-16.44c4.42-26.52,23.44-53.23,52.48-54.48 | |||
| c24.5-0.56,42.87,10.8,47.47,35.65c1.19,6.43,1.79,12.91,1.8,19.46c0.06,42.99,0.08,87.05,0.05,132.2c0,0.36,0.18,0.54,0.53,0.54 | |||
| l36.76,0.01c0.23,0,0.42-0.19,0.42-0.42c0.09-37.85,0.07-75.53-0.04-113.04c-0.03-8.1,0.3-14.47,0.98-19.11 | |||
| c1.67-11.49,5.87-22.17,12.59-32.03c3.2-4.71,7.28-9.01,12.24-12.91c9.58-7.53,20.51-10.95,32.79-10.28 | |||
| c13.51,0.18,26.8,5.06,35.04,15.92c7.31,9.65,9.7,24.58,9.73,36.42c0.1,41.75,0.11,86.68,0.04,134.79c0,0.43,0.21,0.65,0.64,0.65 | |||
| l36.52,0.01c0.4,0,0.6-0.2,0.6-0.6c-0.11-43.76-0.11-88.64,0.02-134.65C1055.67,588.72,1055.35,582.04,1054.69,576.29z"/> | |||
| </g> | |||
| <g> | |||
| <linearGradient id="SVGID_00000006692382290725070250000008342888873359191228_" gradientUnits="userSpaceOnUse" x1="1162.6759" y1="620.3867" x2="1350.1307" y2="620.3867"> | |||
| <stop offset="0" style="stop-color:#439DDF"/> | |||
| <stop offset="0" style="stop-color:#4F87ED"/> | |||
| <stop offset="0" style="stop-color:#9177C7"/> | |||
| <stop offset="0.7815" style="stop-color:#9476C5"/> | |||
| <stop offset="0.8883" style="stop-color:#BC688E"/> | |||
| <stop offset="1" style="stop-color:#D6645D"/> | |||
| </linearGradient> | |||
| <path style="fill:url(#SVGID_00000006692382290725070250000008342888873359191228_);" d="M1341,549.28 | |||
| c-10.36-21.4-28.17-34.24-51.19-38.36c-26.08-4.67-51.48-0.1-72.37,16.89c-6.89,5.26-12.22,11.18-15.98,17.77 | |||
| c-1.85,3.25-2.78,3.01-2.78-0.73l0.01-28.19c0-0.39-0.19-0.58-0.58-0.58h-35.08c-0.19,0-0.35,0.16-0.35,0.36V730.9 | |||
| c0,0.45,0.22,0.68,0.67,0.68l37.12-0.01c0.34,0,0.51-0.17,0.51-0.51c0.07-38.4,0.06-77.08-0.03-116.03 | |||
| c-0.02-7.34,0.45-13.46,1.4-18.35c3.01-15.38,10.38-28.53,22.11-39.45c1.76-1.37,3.5-2.7,5.22-3.97 | |||
| c11.17-8.28,23.33-10.43,36.92-9.26c16.58,1.43,33.15,9.83,39.82,25.25c3.53,8.16,5.3,17.25,5.32,27.28 | |||
| c0.07,43.53,0.09,88.45,0.04,134.74c0,0.21,0.1,0.31,0.3,0.31h37.45c0.37,0,0.56-0.19,0.56-0.57c0.07-46.67,0.06-93.28-0.05-139.83 | |||
| C1350,575.62,1346.98,561.65,1341,549.28z M1217.67,529.37c0.01,0,0.03,0,0.04,0c0.02,0.01,0.04,0.02,0.06,0.02 | |||
| C1217.73,529.38,1217.7,529.37,1217.67,529.37z"/> | |||
| </g> | |||
| <g> | |||
| <path class="st2" d="M1127.23,516.08h-37.32c-0.15,0-0.27,0.12-0.27,0.27v214.96c0,0.15,0.12,0.27,0.27,0.27h37.32 | |||
| c0.15,0,0.27-0.12,0.27-0.27V516.35C1127.5,516.2,1127.38,516.08,1127.23,516.08z"/> | |||
| </g> | |||
| <g> | |||
| <linearGradient id="SVGID_00000158714738904643395990000007397907749964941716_" gradientUnits="userSpaceOnUse" x1="1381.8658" y1="623.8276" x2="1419.6459" y2="623.8276"> | |||
| <stop offset="0" style="stop-color:#439DDF"/> | |||
| <stop offset="0" style="stop-color:#4F87ED"/> | |||
| <stop offset="0" style="stop-color:#9476C5"/> | |||
| <stop offset="0" style="stop-color:#BC688E"/> | |||
| <stop offset="1" style="stop-color:#D6645D"/> | |||
| </linearGradient> | |||
| <path style="fill:url(#SVGID_00000158714738904643395990000007397907749964941716_);" d="M1419.38,516.08h-37.24 | |||
| c-0.15,0-0.27,0.12-0.27,0.27v214.96c0,0.15,0.12,0.27,0.27,0.27h37.24c0.15,0,0.27-0.12,0.27-0.27V516.35 | |||
| C1419.65,516.2,1419.53,516.08,1419.38,516.08z"/> | |||
| </g> | |||
| </svg> | |||
| @@ -1,27 +1,27 @@ | |||
| .navs { | |||
| ul { | |||
| padding: 0; | |||
| list-style: none; | |||
| display: flex; | |||
| } | |||
| li { | |||
| margin-right: 1em; | |||
| } | |||
| } | |||
| .layout { | |||
| height: 100vh; | |||
| } | |||
| body { | |||
| margin: 0; | |||
| } | |||
| .divider { | |||
| margin: 0; | |||
| } | |||
| .clickAvailable { | |||
| cursor: pointer; | |||
| } | |||
| .navs { | |||
| ul { | |||
| padding: 0; | |||
| list-style: none; | |||
| display: flex; | |||
| } | |||
| li { | |||
| margin-right: 1em; | |||
| } | |||
| } | |||
| .layout { | |||
| height: 100vh; | |||
| } | |||
| body { | |||
| margin: 0; | |||
| } | |||
| .divider { | |||
| margin: 0; | |||
| } | |||
| .clickAvailable { | |||
| cursor: pointer; | |||
| } | |||
| @@ -1,37 +1,37 @@ | |||
| import { Divider, Layout, theme } from 'antd'; | |||
| import React from 'react'; | |||
| import { Outlet } from 'umi'; | |||
| import '../locales/config'; | |||
| import Header from './components/header'; | |||
| import styles from './index.less'; | |||
| const { Content } = Layout; | |||
| const App: React.FC = () => { | |||
| const { | |||
| token: { colorBgContainer, borderRadiusLG }, | |||
| } = theme.useToken(); | |||
| return ( | |||
| <Layout className={styles.layout}> | |||
| <Layout> | |||
| <Header></Header> | |||
| <Divider orientationMargin={0} className={styles.divider} /> | |||
| <Content | |||
| style={{ | |||
| minHeight: 280, | |||
| background: colorBgContainer, | |||
| borderRadius: borderRadiusLG, | |||
| overflow: 'auto', | |||
| display: 'flex', | |||
| }} | |||
| > | |||
| <Outlet /> | |||
| </Content> | |||
| </Layout> | |||
| </Layout> | |||
| ); | |||
| }; | |||
| export default App; | |||
| import { Divider, Layout, theme } from 'antd'; | |||
| import React from 'react'; | |||
| import { Outlet } from 'umi'; | |||
| import '../locales/config'; | |||
| import Header from './components/header'; | |||
| import styles from './index.less'; | |||
| const { Content } = Layout; | |||
| const App: React.FC = () => { | |||
| const { | |||
| token: { colorBgContainer, borderRadiusLG }, | |||
| } = theme.useToken(); | |||
| return ( | |||
| <Layout className={styles.layout}> | |||
| <Layout> | |||
| <Header></Header> | |||
| <Divider orientationMargin={0} className={styles.divider} /> | |||
| <Content | |||
| style={{ | |||
| minHeight: 280, | |||
| background: colorBgContainer, | |||
| borderRadius: borderRadiusLG, | |||
| overflow: 'auto', | |||
| display: 'flex', | |||
| }} | |||
| > | |||
| <Outlet /> | |||
| </Content> | |||
| </Layout> | |||
| </Layout> | |||
| ); | |||
| }; | |||
| export default App; | |||
| @@ -1,30 +1,30 @@ | |||
| import i18n from 'i18next'; | |||
| import LanguageDetector from 'i18next-browser-languagedetector'; | |||
| import { initReactI18next } from 'react-i18next'; | |||
| import translation_en from './en'; | |||
| import translation_zh from './zh'; | |||
| import translation_zh_traditional from './zh-traditional'; | |||
| const resources = { | |||
| en: translation_en, | |||
| zh: translation_zh, | |||
| 'zh-TRADITIONAL': translation_zh_traditional, | |||
| }; | |||
| i18n | |||
| .use(initReactI18next) | |||
| .use(LanguageDetector) | |||
| .init({ | |||
| detection: { | |||
| lookupLocalStorage: 'lng', | |||
| }, | |||
| supportedLngs: ['en', 'zh', 'zh-TRADITIONAL'], | |||
| resources, | |||
| fallbackLng: 'en', | |||
| interpolation: { | |||
| escapeValue: false, | |||
| }, | |||
| }); | |||
| export default i18n; | |||
| import i18n from 'i18next'; | |||
| import LanguageDetector from 'i18next-browser-languagedetector'; | |||
| import { initReactI18next } from 'react-i18next'; | |||
| import translation_en from './en'; | |||
| import translation_zh from './zh'; | |||
| import translation_zh_traditional from './zh-traditional'; | |||
| const resources = { | |||
| en: translation_en, | |||
| zh: translation_zh, | |||
| 'zh-TRADITIONAL': translation_zh_traditional, | |||
| }; | |||
| i18n | |||
| .use(initReactI18next) | |||
| .use(LanguageDetector) | |||
| .init({ | |||
| detection: { | |||
| lookupLocalStorage: 'lng', | |||
| }, | |||
| supportedLngs: ['en', 'zh', 'zh-TRADITIONAL'], | |||
| resources, | |||
| fallbackLng: 'en', | |||
| interpolation: { | |||
| escapeValue: false, | |||
| }, | |||
| }); | |||
| export default i18n; | |||
| @@ -1,16 +1,19 @@ | |||
| import { Button, Result } from 'antd'; | |||
| import { history } from 'umi'; | |||
| const NoFoundPage = () => { | |||
| return (<Result | |||
| status="404" | |||
| title="404" | |||
| subTitle="页面未找到,请输入正确的地址。" | |||
| extra={< Button type="primary" onClick={() => history.push('/')}> | |||
| 返回主页 | |||
| </Button>} | |||
| /> | |||
| ) | |||
| }; | |||
| export default NoFoundPage; | |||
| import { Button, Result } from 'antd'; | |||
| import { history } from 'umi'; | |||
| const NoFoundPage = () => { | |||
| return ( | |||
| <Result | |||
| status="404" | |||
| title="404" | |||
| subTitle="页面未找到,请输入正确的地址。" | |||
| extra={ | |||
| <Button type="primary" onClick={() => history.push('/')}> | |||
| 返回主页 | |||
| </Button> | |||
| } | |||
| /> | |||
| ); | |||
| }; | |||
| export default NoFoundPage; | |||
| @@ -1,54 +1,54 @@ | |||
| .datasetWrapper { | |||
| padding: 30px 30px 0; | |||
| height: 100%; | |||
| } | |||
| .documentTable { | |||
| tbody { | |||
| // height: calc(100vh - 508px); | |||
| } | |||
| } | |||
| .filter { | |||
| height: 32px; | |||
| display: flex; | |||
| margin: 10px 0; | |||
| justify-content: space-between; | |||
| padding: 24px 0; | |||
| align-items: center; | |||
| } | |||
| .deleteIconWrapper { | |||
| width: 22px; | |||
| text-align: center; | |||
| } | |||
| .img { | |||
| height: 24px; | |||
| width: 24px; | |||
| display: inline-block; | |||
| vertical-align: middle; | |||
| } | |||
| .column { | |||
| min-width: 200px; | |||
| } | |||
| .toChunks { | |||
| cursor: pointer; | |||
| } | |||
| .pageInputNumber { | |||
| width: 220px; | |||
| } | |||
| .questionIcon { | |||
| margin-inline-start: 4px; | |||
| color: rgba(0, 0, 0, 0.45); | |||
| cursor: help; | |||
| writing-mode: horizontal-tb; | |||
| } | |||
| .nameText { | |||
| color: #1677ff; | |||
| } | |||
| .datasetWrapper { | |||
| padding: 30px 30px 0; | |||
| height: 100%; | |||
| } | |||
| .documentTable { | |||
| tbody { | |||
| // height: calc(100vh - 508px); | |||
| } | |||
| } | |||
| .filter { | |||
| height: 32px; | |||
| display: flex; | |||
| margin: 10px 0; | |||
| justify-content: space-between; | |||
| padding: 24px 0; | |||
| align-items: center; | |||
| } | |||
| .deleteIconWrapper { | |||
| width: 22px; | |||
| text-align: center; | |||
| } | |||
| .img { | |||
| height: 24px; | |||
| width: 24px; | |||
| display: inline-block; | |||
| vertical-align: middle; | |||
| } | |||
| .column { | |||
| min-width: 200px; | |||
| } | |||
| .toChunks { | |||
| cursor: pointer; | |||
| } | |||
| .pageInputNumber { | |||
| width: 220px; | |||
| } | |||
| .questionIcon { | |||
| margin-inline-start: 4px; | |||
| color: rgba(0, 0, 0, 0.45); | |||
| cursor: help; | |||
| writing-mode: horizontal-tb; | |||
| } | |||
| .nameText { | |||
| color: #1677ff; | |||
| } | |||