### What problem does this PR solve?

Rework Dockerfile.scratch

- Multiple stage Dockerfile
- Removed conda
- Replaced pip with poetry
- Added missing dependencies and fixed package version conflicts
- Added deepdoc models

### Type of change

- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
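A minimal end-to-end sketch of using the reworked build, assuming the file keeps the name `Dockerfile.scratch`, that `download_deps.sh` lives in the repository root, and that the compose file under `docker/` picks its image tag from `RAGFLOW_VERSION`; the exact paths and tag are illustrative, not taken from this diff:

```bash
# Fetch the deepdoc models first, so the Dockerfile's COPY of det.onnx, layout.*.onnx,
# ocr.res, rec.onnx, tsr.onnx and updown_concat_xgb.model finds them in the build context
bash download_deps.sh

# Multi-stage build: "builder" compiles the web UI and installs the Poetry lock file,
# "production" copies only /ragflow/web/dist and the in-project .venv
docker build -f Dockerfile.scratch -t infiniflow/ragflow:poetry .

# RAGFLOW_VERSION=poetry in docker/.env should then select the freshly built tag
cd docker && docker compose up -d
```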
@@ -1,56 +1,91 @@
FROM ubuntu:22.04
# base stage
FROM ubuntu:24.04 AS base
USER root
WORKDIR /ragflow
RUN apt-get update && apt-get install -y wget curl build-essential libopenmpi-dev
RUN rm -f /etc/apt/apt.conf.d/docker-clean \
    && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
    bash ~/miniconda.sh -b -p /root/miniconda3 && \
    rm ~/miniconda.sh && ln -s /root/miniconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
    echo ". /root/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate base" >> ~/.bashrc
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt update && apt-get --no-install-recommends install -y ca-certificates
ENV PATH /root/miniconda3/bin:$PATH
# if you are located in China, you can use the Tsinghua mirror to speed up apt
RUN sed -i 's|http://archive.ubuntu.com|https://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list.d/ubuntu.sources
RUN conda create -y --name py11 python=3.11
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt update && apt install -y curl libpython3-dev nginx openmpi-bin openmpi-common libopenmpi-dev libglib2.0-0 libglx-mesa0 \
    && rm -rf /var/lib/apt/lists/* \
    && curl -sSL https://install.python-poetry.org | python3 -
ENV CONDA_DEFAULT_ENV py11
ENV CONDA_PREFIX /root/miniconda3/envs/py11
ENV PATH $CONDA_PREFIX/bin:$PATH
ENV PYTHONDONTWRITEBYTECODE=1 LD_LIBRARY_PATH=usr/lib/x86_64-linux-gnu/openmpi/lib:$LD_LIBRARY_PATH
RUN curl -sL https://deb.nodesource.com/setup_14.x | bash -
RUN apt-get install -y nodejs
# Configure Poetry
ENV POETRY_NO_INTERACTION=1
ENV POETRY_VIRTUALENVS_IN_PROJECT=true
ENV POETRY_VIRTUALENVS_CREATE=true
ENV POETRY_REQUESTS_TIMEOUT=15
RUN apt-get install -y nginx
# builder stage
FROM base AS builder
USER root
WORKDIR /ragflow
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt update && apt install -y nodejs npm && \
    rm -rf /var/lib/apt/lists/*
# if you are located in China, you can use the Taobao registry to speed up npm and yarn
RUN npm config set registry https://registry.npmmirror.com/
# https://yarnpkg.com/getting-started/install
COPY web web
RUN cd web && npm install -g corepack && corepack enable && yarn install && yarn run build
ADD ./web ./web
ADD ./api ./api
ADD ./conf ./conf
ADD ./deepdoc ./deepdoc
ADD ./rag ./rag
ADD ./requirements.txt ./requirements.txt
ADD ./agent ./agent
ADD ./graphrag ./graphrag
# install dependencies from poetry.lock file
COPY pyproject.toml poetry.toml poetry.lock ./
RUN --mount=type=cache,target=/root/.cache/pypoetry,sharing=locked \
    /root/.local/bin/poetry install --sync --no-cache --no-root
RUN apt install openmpi-bin openmpi-common libopenmpi-dev
ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/openmpi/lib:$LD_LIBRARY_PATH
RUN rm /root/miniconda3/envs/py11/compiler_compat/ld
RUN cd ./web && npm i --force && npm run build
RUN conda run -n py11 pip install -i https://mirrors.aliyun.com/pypi/simple/ -r ./requirements.txt
# production stage
FROM base AS production
USER root
WORKDIR /ragflow
RUN apt-get update && \
    apt-get install -y libglib2.0-0 libgl1-mesa-glx && \
# Install python packages' dependencies
# cv2 requires libGL.so.1
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt update && apt install -y --no-install-recommends nginx libgl1 vim less && \
    rm -rf /var/lib/apt/lists/*
RUN conda run -n py11 pip install -i https://mirrors.aliyun.com/pypi/simple/ ollama
RUN conda run -n py11 python -m nltk.downloader punkt
RUN conda run -n py11 python -m nltk.downloader wordnet
COPY web web
COPY api api
COPY conf conf
COPY deepdoc deepdoc
COPY rag rag
COPY agent agent
COPY graphrag graphrag
COPY pyproject.toml poetry.toml poetry.lock ./
# Copy compiled web pages
COPY --from=builder /ragflow/web/dist /ragflow/web/dist
# Copy Python environment and packages
ENV VIRTUAL_ENV=/ragflow/.venv
COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
ENV PATH="${VIRTUAL_ENV}/bin:/root/.local/bin:${PATH}"
# Download nltk data
RUN python3 -m nltk.downloader wordnet punkt punkt_tab
# Copy models downloaded via download_deps.sh
COPY det.onnx layout.laws.onnx layout.manual.onnx layout.onnx layout.paper.onnx ocr.res rec.onnx tsr.onnx updown_concat_xgb.model /ragflow/rag/res/deepdoc/
ENV PYTHONPATH=/ragflow/
ENV HF_ENDPOINT=https://hf-mirror.com
ADD docker/entrypoint.sh ./entrypoint.sh
COPY docker/entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]
@@ -46,7 +46,7 @@ def update_progress():
if __name__ == '__main__':
    print("""
    print(r"""
____ ______ __
/ __ \ ____ _ ____ _ / ____// /____ _ __
/ /_/ // __ `// __ `// /_ / // __ \| | /| / /
@@ -33,10 +33,13 @@ REDIS_PASSWORD=infini_rag_flow
SVR_HTTP_PORT=9380

RAGFLOW_VERSION=dev
RAGFLOW_VERSION=poetry

TIMEZONE='Asia/Shanghai'

# Inside GFW, we need the following huggingface.co mirror:
HF_ENDPOINT=https://hf-mirror.com

######## OS setup for ES ###########
# sysctl vm.max_map_count
# sudo sysctl -w vm.max_map_count=262144
@@ -15,6 +15,7 @@ services:
      - ${SVR_HTTP_PORT}:9380
      - 80:80
      - 443:443
      - 5678:5678
    volumes:
      - ./service_conf.yaml:/ragflow/conf/service_conf.yaml
      - ./ragflow-logs:/ragflow/logs
@@ -23,7 +24,7 @@ services:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf
    environment:
      - TZ=${TIMEZONE}
      - HF_ENDPOINT=https://huggingface.co
      - HF_ENDPOINT=${HF_ENDPOINT}
      - MACOS=${MACOS}
    networks:
      - ragflow
@@ -0,0 +1,38 @@
#!/usr/bin/env bash

download()
{
    echo "download $1"
    # https://stackoverflow.com/questions/3162385/how-to-split-a-string-in-shell-and-get-the-last-field
    fn=${1##*/}
    if [ ! -f $fn ] ; then
        wget --no-check-certificate $1
    fi
}

# https://stackoverflow.com/questions/24628076/convert-multiline-string-to-array
names="https://huggingface.co/InfiniFlow/deepdoc/resolve/main/det.onnx
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.laws.onnx
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.manual.onnx
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.onnx
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.paper.onnx
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/ocr.res
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/rec.onnx
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/tsr.onnx
https://huggingface.co/InfiniFlow/text_concat_xgb_v1.0/resolve/main/updown_concat_xgb.model"

SAVEIFS=$IFS   # Save current IFS (Internal Field Separator)
IFS=$'\n'      # Change IFS to newline char
names=($names) # split the `names` string into an array by the same name
IFS=$SAVEIFS   # Restore original IFS

find . -size 0 | xargs rm -f

# https://stackoverflow.com/questions/15466808/shell-iterate-over-array
for ((i=0; i<${#names[@]}; i+=1)); do
    url="${names[$i]}"
    download $url
    if [ $? != 0 ]; then
        exit -1
    fi
done

find . -size 0 | xargs rm -f
@@ -0,0 +1,4 @@
[virtualenvs]
in-project = true
create = true
prefer-active-python = true
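These settings keep Poetry's virtualenv inside the project (at `/ragflow/.venv` in the builder stage), which is what lets the production stage simply `COPY --from=builder` the whole environment. As a hedged aside, and assuming a Poetry 1.x CLI, an equivalent project-local `poetry.toml` could also be produced with `poetry config --local`:

```bash
# Each command writes one of the [virtualenvs] keys above into ./poetry.toml
poetry config virtualenvs.in-project true --local
poetry config virtualenvs.create true --local
poetry config virtualenvs.prefer-active-python true --local
```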
@@ -0,0 +1,129 @@
[tool.poetry]
name = "ragflow"
version = "0.11.0"
description = "[RAGFlow](https://ragflow.io/) is an open-source RAG (Retrieval-Augmented Generation) engine based on deep document understanding. It offers a streamlined RAG workflow for businesses of any scale, combining LLM (Large Language Models) to provide truthful question-answering capabilities, backed by well-founded citations from various complex formatted data."
authors = ["Your Name <you@example.com>"]
license = "https://github.com/infiniflow/ragflow/blob/main/LICENSE"
readme = "README.md"
package-mode = false

[tool.poetry.dependencies]
python = ">=3.12,<3.13"
datrie = "0.8.2"
akshare = "1.14.72"
azure-storage-blob = "12.22.0"
azure-identity = "1.17.1"
azure-storage-file-datalake = "12.16.0"
anthropic = "=0.34.1"
arxiv = "2.1.3"
aspose-slides = "24.8.0"
bcembedding = "0.1.3"
bio = "1.7.1"
boto3 = "1.34.140"
botocore = "1.34.140"
cachetools = "5.3.3"
chardet = "5.2.0"
cn2an = "0.5.22"
cohere = "5.6.2"
dashscope = "1.14.1"
deepl = "1.18.0"
demjson3 = "3.0.6"
discord-py = "2.3.2"
duckduckgo-search = "6.1.9"
editdistance = "0.8.1"
elastic-transport = "8.12.0"
elasticsearch = "8.12.1"
elasticsearch-dsl = "8.12.0"
fastembed = "^0.3.6"
fasttext = "0.9.3"
filelock = "3.15.4"
flagembedding = "1.2.10"
flask = "3.0.3"
flask-cors = "5.0.0"
flask-login = "0.6.3"
flask-session = "0.8.0"
google-search-results = "2.4.2"
groq = "0.9.0"
hanziconv = "0.3.2"
html-text = "0.6.2"
httpx = "0.27.0"
huggingface-hub = "^0.25.0"
infinity-emb = "0.0.51"
itsdangerous = "2.1.2"
markdown = "3.6"
markdown-to-json = "2.1.1"
minio = "7.2.4"
mistralai = "0.4.2"
nltk = "3.9.1"
numpy = "1.26.4"
ollama = "0.2.1"
onnxruntime = "1.17.3"
onnxruntime-gpu = "1.17.1"
openai = "1.12.0"
opencv-python = "4.9.0.80"
opencv-python-headless = "4.9.0.80"
openpyxl = "3.1.2"
ormsgpack = "1.5.0"
pandas = "2.2.2"
pdfplumber = "0.10.4"
peewee = "3.17.1"
pillow = "10.3.0"
protobuf = "5.27.2"
psycopg2-binary = "2.9.9"
pyclipper = "1.3.0.post5"
pycryptodomex = "3.20.0"
pypdf = "^5.0.0"
pytest = "8.2.2"
python-dotenv = "1.0.1"
python-dateutil = "2.8.2"
python-pptx = "0.6.23"
pywencai = "0.12.2"
qianfan = "0.4.6"
ranx = "0.3.20"
readability-lxml = "0.8.1"
redis = "5.0.3"
requests = "2.32.2"
replicate = "0.31.0"
roman-numbers = "1.0.2"
ruamel-base = "1.0.0"
scholarly = "1.7.11"
scikit-learn = "1.5.0"
selenium = "4.22.0"
setuptools = "70.0.0"
shapely = "2.0.5"
six = "1.16.0"
strenum = "0.4.15"
tabulate = "0.9.0"
tencentcloud-sdk-python = "3.0.1215"
tika = "2.6.0"
tiktoken = "0.6.0"
torch = "2.3.0"
transformers = "4.38.1"
umap = "0.1.1"
vertexai = "1.64.0"
volcengine = "1.0.146"
voyageai = "0.2.3"
webdriver-manager = "4.0.1"
werkzeug = "3.0.3"
wikipedia = "1.4.0"
word2number = "1.1"
xgboost = "2.1.0"
xpinyin = "0.7.6"
yfinance = "0.1.96"
zhipuai = "2.0.1"
ruamel-yaml = "^0.18.6"
google-generativeai = "^0.8.1"
python-docx = "^1.1.2"
pypdf2 = "^3.0.1"
graspologic = "^3.4.1"
pymysql = "^1.1.1"

[[tool.poetry.source]]
name = "tsinghua"
url = "https://pypi.tuna.tsinghua.edu.cn/simple/"
priority = "primary"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
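For reference (not part of this diff), a typical local workflow after editing this dependency list might look like the following, assuming a Poetry 1.x CLI; exact flags vary by Poetry version:

```bash
# Re-resolve and rewrite poetry.lock without bumping versions that are already locked
poetry lock --no-update

# Install into the in-project .venv and prune anything no longer in the lock file,
# mirroring the builder stage's `poetry install --sync --no-cache --no-root`
poetry install --sync --no-root
```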
@@ -4,7 +4,7 @@ azure-identity==1.17.1
azure-storage-file-datalake==12.16.0
anthropic===0.34.1
arxiv==2.1.3
Aspose.Slides==24.2.0
Aspose.Slides==24.8.0
BCEmbedding==0.1.3
Bio==1.7.1
boto3==1.34.140
@@ -43,7 +43,7 @@ Markdown==3.6
markdown_to_json==2.1.1
minio==7.2.4
mistralai==0.4.2
nltk==3.9
nltk==3.9.1
numpy==1.26.4
ollama==0.2.1
onnxruntime==1.17.3
@@ -57,7 +57,6 @@ pandas==2.2.2
pdfplumber==0.10.4
peewee==3.17.1
Pillow==10.3.0
pipreqs==0.5.0
protobuf==5.27.2
psycopg2-binary==2.9.9
pyclipper==1.3.0.post5
@@ -0,0 +1,39 @@
Types: deb
URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
Suites: noble noble-updates noble-backports
Components: main restricted universe multiverse
Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

# Source (deb-src) mirrors are commented out by default to speed up apt update; uncomment them if needed
# Types: deb-src
# URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
# Suites: noble noble-updates noble-backports
# Components: main restricted universe multiverse
# Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

# The security-update sources below include both the official and mirror configurations; adjust the comments to switch between them if needed
Types: deb
URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
Suites: noble-security
Components: main restricted universe multiverse
Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

# Types: deb-src
# URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
# Suites: noble-security
# Components: main restricted universe multiverse
# Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

# Pre-release (proposed) sources; enabling them is not recommended
# Types: deb
# URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
# Suites: noble-proposed
# Components: main restricted universe multiverse
# Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

# # Types: deb-src
# # URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
# # Suites: noble-proposed
# # Components: main restricted universe multiverse
# # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg
@@ -8,7 +8,7 @@
    "lint": "umi lint --eslint-only",
    "prepare": "cd .. && husky web/.husky",
    "setup": "umi setup",
    "start": "npm run dev",
    "start": "yarn dev",
    "test": "jest --no-cache --coverage"
  },
  "lint-staged": {