### What problem does this PR solve?

Rework Dockerfile.scratch:

- Multiple stage Dockerfile
- Removed conda
- Replaced pip with poetry
- Added missing dependencies and fixed package version conflicts
- Added deepdoc models

### Type of change

- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
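For reference, a minimal sketch of building the reworked multi-stage image. The file name Dockerfile.scratch and the need to fetch the deepdoc models beforehand come from this PR; the image tag is an assumption.

```bash
# BuildKit is required because the Dockerfile uses RUN --mount=type=cache.
# Run download_deps.sh first so the deepdoc model files exist in the build context
# (the production stage COPYies det.onnx, layout.onnx, etc. from the context root).
./download_deps.sh
DOCKER_BUILDKIT=1 docker build -f Dockerfile.scratch -t ragflow:poetry .
```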
FROM ubuntu:22.04
# base stage
FROM ubuntu:24.04 AS base
USER root
WORKDIR /ragflow
RUN apt-get update && apt-get install -y wget curl build-essential libopenmpi-dev
RUN rm -f /etc/apt/apt.conf.d/docker-clean \
    && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
    bash ~/miniconda.sh -b -p /root/miniconda3 && \
    rm ~/miniconda.sh && ln -s /root/miniconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
    echo ". /root/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate base" >> ~/.bashrc
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt update && apt-get --no-install-recommends install -y ca-certificates
ENV PATH /root/miniconda3/bin:$PATH
# If you are located in China, you can use the tsinghua mirror to speed up apt
RUN sed -i 's|http://archive.ubuntu.com|https://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list.d/ubuntu.sources
RUN conda create -y --name py11 python=3.11
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt update && apt install -y curl libpython3-dev nginx openmpi-bin openmpi-common libopenmpi-dev libglib2.0-0 libglx-mesa0 \
    && rm -rf /var/lib/apt/lists/* \
    && curl -sSL https://install.python-poetry.org | python3 -
ENV CONDA_DEFAULT_ENV py11
ENV CONDA_PREFIX /root/miniconda3/envs/py11
ENV PATH $CONDA_PREFIX/bin:$PATH
ENV PYTHONDONTWRITEBYTECODE=1 LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/openmpi/lib:$LD_LIBRARY_PATH
RUN curl -sL https://deb.nodesource.com/setup_14.x | bash -
RUN apt-get install -y nodejs
# Configure Poetry
ENV POETRY_NO_INTERACTION=1
ENV POETRY_VIRTUALENVS_IN_PROJECT=true
ENV POETRY_VIRTUALENVS_CREATE=true
ENV POETRY_REQUESTS_TIMEOUT=15
RUN apt-get install -y nginx
# builder stage
FROM base AS builder
USER root
WORKDIR /ragflow
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt update && apt install -y nodejs npm && \
    rm -rf /var/lib/apt/lists/*
# If you are located in China, you can use the taobao registry to speed up npm and yarn
RUN npm config set registry https://registry.npmmirror.com/
# https://yarnpkg.com/getting-started/install
COPY web web
RUN cd web && npm install -g corepack && corepack enable && yarn install && yarn run build
ADD ./web ./web
ADD ./api ./api
ADD ./conf ./conf
ADD ./deepdoc ./deepdoc
ADD ./rag ./rag
ADD ./requirements.txt ./requirements.txt
ADD ./agent ./agent
ADD ./graphrag ./graphrag
# install dependencies from poetry.lock file
COPY pyproject.toml poetry.toml poetry.lock ./
RUN --mount=type=cache,target=/root/.cache/pypoetry,sharing=locked \
    /root/.local/bin/poetry install --sync --no-cache --no-root
RUN apt install openmpi-bin openmpi-common libopenmpi-dev
ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/openmpi/lib:$LD_LIBRARY_PATH
RUN rm /root/miniconda3/envs/py11/compiler_compat/ld
RUN cd ./web && npm i --force && npm run build
RUN conda run -n py11 pip install -i https://mirrors.aliyun.com/pypi/simple/ -r ./requirements.txt
# production stage
FROM base AS production
USER root
WORKDIR /ragflow
RUN apt-get update && \
    apt-get install -y libglib2.0-0 libgl1-mesa-glx && \
# Install python packages' dependencies
# cv2 requires libGL.so.1
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt update && apt install -y --no-install-recommends nginx libgl1 vim less && \
    rm -rf /var/lib/apt/lists/*
RUN conda run -n py11 pip install -i https://mirrors.aliyun.com/pypi/simple/ ollama
RUN conda run -n py11 python -m nltk.downloader punkt
RUN conda run -n py11 python -m nltk.downloader wordnet
COPY web web
COPY api api
COPY conf conf
COPY deepdoc deepdoc
COPY rag rag
COPY agent agent
COPY graphrag graphrag
COPY pyproject.toml poetry.toml poetry.lock ./
# Copy compiled web pages
COPY --from=builder /ragflow/web/dist /ragflow/web/dist
# Copy Python environment and packages
ENV VIRTUAL_ENV=/ragflow/.venv
COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
ENV PATH="${VIRTUAL_ENV}/bin:/root/.local/bin:${PATH}"
# Download nltk data
RUN python3 -m nltk.downloader wordnet punkt punkt_tab
# Copy models downloaded via download_deps.sh
COPY det.onnx layout.laws.onnx layout.manual.onnx layout.onnx layout.paper.onnx ocr.res rec.onnx tsr.onnx updown_concat_xgb.model /ragflow/rag/res/deepdoc/
ENV PYTHONPATH=/ragflow/
ENV HF_ENDPOINT=https://hf-mirror.com
ADD docker/entrypoint.sh ./entrypoint.sh
COPY docker/entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]
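A hedged sketch of running the built image on its own, outside docker-compose. Only the 9380 HTTP port and the nginx ports 80/443 come from the compose file further down; the container name and image tag are assumptions.

```bash
# entrypoint.sh presumably starts nginx (serving the compiled web/dist pages) and the RAGFlow server.
docker run -d --name ragflow -p 9380:9380 -p 80:80 -p 443:443 ragflow:poetry
curl -s http://localhost:9380/ >/dev/null && echo "HTTP API reachable"
```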
if __name__ == '__main__':
    print("""
    print(r"""
     ____   ______ __
    / __ \ ____ _ ____ _ / ____// /____ _ __
    / /_/ // __ `// __ `// /_ / // __ \| | /| / /
SVR_HTTP_PORT=9380
RAGFLOW_VERSION=dev
RAGFLOW_VERSION=poetry
TIMEZONE='Asia/Shanghai'
# Inside GFW, we need the following huggingface.co mirror:
HF_ENDPOINT=https://hf-mirror.com
######## OS setup for ES ###########
# sysctl vm.max_map_count
# sudo sysctl -w vm.max_map_count=262144
      - ${SVR_HTTP_PORT}:9380
      - 80:80
      - 443:443
      - 5678:5678
    volumes:
      - ./service_conf.yaml:/ragflow/conf/service_conf.yaml
      - ./ragflow-logs:/ragflow/logs
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf
    environment:
      - TZ=${TIMEZONE}
      - HF_ENDPOINT=https://huggingface.co
      - HF_ENDPOINT=${HF_ENDPOINT}
      - MACOS=${MACOS}
    networks:
      - ragflow
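A minimal sketch of how the HF_ENDPOINT indirection above is presumably used: define it once in the .env file shown earlier and docker compose passes it through to the container. The docker/ working directory and the service name are assumptions.

```bash
cd docker
grep HF_ENDPOINT .env                               # HF_ENDPOINT=https://hf-mirror.com, as set in this PR's .env
docker compose up -d
docker compose exec ragflow printenv HF_ENDPOINT    # service name "ragflow" is assumed
```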
#!/usr/bin/env bash

download()
{
    echo "download $1"
    # https://stackoverflow.com/questions/3162385/how-to-split-a-string-in-shell-and-get-the-last-field
    fn=${1##*/}
    if [ ! -f "$fn" ] ; then
        wget --no-check-certificate "$1"
    fi
}

# https://stackoverflow.com/questions/24628076/convert-multiline-string-to-array
names="https://huggingface.co/InfiniFlow/deepdoc/resolve/main/det.onnx
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.laws.onnx
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.manual.onnx
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.onnx
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.paper.onnx
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/ocr.res
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/rec.onnx
https://huggingface.co/InfiniFlow/deepdoc/resolve/main/tsr.onnx
https://huggingface.co/InfiniFlow/text_concat_xgb_v1.0/resolve/main/updown_concat_xgb.model"

SAVEIFS=$IFS   # Save current IFS (Internal Field Separator)
IFS=$'\n'      # Change IFS to newline char
names=($names) # split the `names` string into an array by the same name
IFS=$SAVEIFS   # Restore original IFS

find . -size 0 | xargs rm -f

# https://stackoverflow.com/questions/15466808/shell-iterate-over-array
for ((i=0; i<${#names[@]}; i+=1)); do
    url="${names[$i]}"
    download "$url"
    if [ $? != 0 ]; then
        exit 1
    fi
done

find . -size 0 | xargs rm -f
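A sketch of how this script is presumably used before building: run it from the repository root so that the Dockerfile's COPY of the deepdoc models can find the files in the build context. The working directory is an assumption.

```bash
# Downloads the deepdoc ONNX models and the xgboost model into the current directory.
# Zero-size files from failed downloads are removed, so rerunning the script retries them.
./download_deps.sh
ls det.onnx layout.onnx rec.onnx tsr.onnx updown_concat_xgb.model
```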
[virtualenvs]
in-project = true
create = true
prefer-active-python = true
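With in-project = true above (and the matching POETRY_VIRTUALENVS_IN_PROJECT=true in the Dockerfile), the virtual environment lands inside the project at /ragflow/.venv, which is what lets the production stage pick it up with a single COPY --from=builder. A rough sketch of checking this, using the same invocation as the builder stage; the flask/peewee import check is just an illustration based on the declared dependencies.

```bash
cd /ragflow
/root/.local/bin/poetry install --sync --no-cache --no-root   # same command as the builder stage
ls .venv/bin/python                                           # the in-project environment Poetry created
.venv/bin/python -c "import flask, peewee; print('deps importable')"
```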
[tool.poetry]
name = "ragflow"
version = "0.11.0"
description = "[RAGFlow](https://ragflow.io/) is an open-source RAG (Retrieval-Augmented Generation) engine based on deep document understanding. It offers a streamlined RAG workflow for businesses of any scale, combining LLM (Large Language Models) to provide truthful question-answering capabilities, backed by well-founded citations from various complex formatted data."
authors = ["Your Name <you@example.com>"]
license = "https://github.com/infiniflow/ragflow/blob/main/LICENSE"
readme = "README.md"
package-mode = false

[tool.poetry.dependencies]
python = ">=3.12,<3.13"
datrie = "0.8.2"
akshare = "1.14.72"
azure-storage-blob = "12.22.0"
azure-identity = "1.17.1"
azure-storage-file-datalake = "12.16.0"
anthropic = "=0.34.1"
arxiv = "2.1.3"
aspose-slides = "24.8.0"
bcembedding = "0.1.3"
bio = "1.7.1"
boto3 = "1.34.140"
botocore = "1.34.140"
cachetools = "5.3.3"
chardet = "5.2.0"
cn2an = "0.5.22"
cohere = "5.6.2"
dashscope = "1.14.1"
deepl = "1.18.0"
demjson3 = "3.0.6"
discord-py = "2.3.2"
duckduckgo-search = "6.1.9"
editdistance = "0.8.1"
elastic-transport = "8.12.0"
elasticsearch = "8.12.1"
elasticsearch-dsl = "8.12.0"
fastembed = "^0.3.6"
fasttext = "0.9.3"
filelock = "3.15.4"
flagembedding = "1.2.10"
flask = "3.0.3"
flask-cors = "5.0.0"
flask-login = "0.6.3"
flask-session = "0.8.0"
google-search-results = "2.4.2"
groq = "0.9.0"
hanziconv = "0.3.2"
html-text = "0.6.2"
httpx = "0.27.0"
huggingface-hub = "^0.25.0"
infinity-emb = "0.0.51"
itsdangerous = "2.1.2"
markdown = "3.6"
markdown-to-json = "2.1.1"
minio = "7.2.4"
mistralai = "0.4.2"
nltk = "3.9.1"
numpy = "1.26.4"
ollama = "0.2.1"
onnxruntime = "1.17.3"
onnxruntime-gpu = "1.17.1"
openai = "1.12.0"
opencv-python = "4.9.0.80"
opencv-python-headless = "4.9.0.80"
openpyxl = "3.1.2"
ormsgpack = "1.5.0"
pandas = "2.2.2"
pdfplumber = "0.10.4"
peewee = "3.17.1"
pillow = "10.3.0"
protobuf = "5.27.2"
psycopg2-binary = "2.9.9"
pyclipper = "1.3.0.post5"
pycryptodomex = "3.20.0"
pypdf = "^5.0.0"
pytest = "8.2.2"
python-dotenv = "1.0.1"
python-dateutil = "2.8.2"
python-pptx = "0.6.23"
pywencai = "0.12.2"
qianfan = "0.4.6"
ranx = "0.3.20"
readability-lxml = "0.8.1"
redis = "5.0.3"
requests = "2.32.2"
replicate = "0.31.0"
roman-numbers = "1.0.2"
ruamel-base = "1.0.0"
scholarly = "1.7.11"
scikit-learn = "1.5.0"
selenium = "4.22.0"
setuptools = "70.0.0"
shapely = "2.0.5"
six = "1.16.0"
strenum = "0.4.15"
tabulate = "0.9.0"
tencentcloud-sdk-python = "3.0.1215"
tika = "2.6.0"
tiktoken = "0.6.0"
torch = "2.3.0"
transformers = "4.38.1"
umap = "0.1.1"
vertexai = "1.64.0"
volcengine = "1.0.146"
voyageai = "0.2.3"
webdriver-manager = "4.0.1"
werkzeug = "3.0.3"
wikipedia = "1.4.0"
word2number = "1.1"
xgboost = "2.1.0"
xpinyin = "0.7.6"
yfinance = "0.1.96"
zhipuai = "2.0.1"
ruamel-yaml = "^0.18.6"
google-generativeai = "^0.8.1"
python-docx = "^1.1.2"
pypdf2 = "^3.0.1"
graspologic = "^3.4.1"
pymysql = "^1.1.1"

[[tool.poetry.source]]
name = "tsinghua"
url = "https://pypi.tuna.tsinghua.edu.cn/simple/"
priority = "primary"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
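A minimal sketch of regenerating the lock file against the declared tsinghua source after editing these dependencies; the exact commands are assumptions and not part of the PR.

```bash
# The [[tool.poetry.source]] block declares the tsinghua mirror as the primary
# package source, so Poetry resolves and downloads packages from that mirror.
poetry lock            # refresh poetry.lock after changing the dependency list
poetry install --sync  # install exactly what the lock file pins
```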
azure-storage-file-datalake==12.16.0
anthropic===0.34.1
arxiv==2.1.3
Aspose.Slides==24.2.0
Aspose.Slides==24.8.0
BCEmbedding==0.1.3
Bio==1.7.1
boto3==1.34.140
markdown_to_json==2.1.1
minio==7.2.4
mistralai==0.4.2
nltk==3.9
nltk==3.9.1
numpy==1.26.4
ollama==0.2.1
onnxruntime==1.17.3
pdfplumber==0.10.4
peewee==3.17.1
Pillow==10.3.0
pipreqs==0.5.0
protobuf==5.27.2
psycopg2-binary==2.9.9
pyclipper==1.3.0.post5
Types: deb
URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
Suites: noble noble-updates noble-backports
Components: main restricted universe multiverse
Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

# The source-code (deb-src) mirrors are commented out by default to speed up apt update; uncomment them if needed
# Types: deb-src
# URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
# Suites: noble noble-updates noble-backports
# Components: main restricted universe multiverse
# Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

# The security update sources below include both the official and the mirror configuration; adjust the comments to switch between them if needed
Types: deb
URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
Suites: noble-security
Components: main restricted universe multiverse
Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

# Types: deb-src
# URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
# Suites: noble-security
# Components: main restricted universe multiverse
# Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

# Pre-release sources; enabling them is not recommended
# Types: deb
# URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
# Suites: noble-proposed
# Components: main restricted universe multiverse
# Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

# # Types: deb-src
# # URIs: https://mirrors.tuna.tsinghua.edu.cn/ubuntu
# # Suites: noble-proposed
# # Components: main restricted universe multiverse
# # Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg
| "lint": "umi lint --eslint-only", | "lint": "umi lint --eslint-only", | ||||
| "prepare": "cd .. && husky web/.husky", | "prepare": "cd .. && husky web/.husky", | ||||
| "setup": "umi setup", | "setup": "umi setup", | ||||
| "start": "npm run dev", | |||||
| "start": "yarn dev", | |||||
| "test": "jest --no-cache --coverage" | "test": "jest --no-cache --coverage" | ||||
| }, | }, | ||||
| "lint-staged": { | "lint-staged": { |