瀏覽代碼

fix: Large document thumbnail display failed (#2763)

### What problem does this PR solve?

In MySQL, when a document's base64-encoded thumbnail is relatively large,
the thumbnail fails to display.
This PR now stores the document thumbnail in MinIO storage instead.

### Type of change

- [✓] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: chongchuanbing <chongchuanbing@gmail.com>
tags/v0.13.0
chongchuanbing 1 年之前
父節點
當前提交
485bfd6c08
No account linked to committer's email address
共有 4 個文件被更改,包括 29 次插入12 次删除
  1. 6
    0
      api/apps/document_app.py
  2. 3
    1
      api/contants.py
  3. 10
    3
      api/db/services/file_service.py
  4. 10
    8
      api/utils/file_utils.py

+ 6
- 0
api/apps/document_app.py 查看文件

from rag.utils.storage_factory import STORAGE_IMPL from rag.utils.storage_factory import STORAGE_IMPL
from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
from api.utils.web_utils import html2pdf, is_valid_url from api.utils.web_utils import html2pdf, is_valid_url
from api.contants import IMG_BASE64_PREFIX




@manager.route('/upload', methods=['POST']) @manager.route('/upload', methods=['POST'])
try: try:
docs, tol = DocumentService.get_by_kb_id( docs, tol = DocumentService.get_by_kb_id(
kb_id, page_number, items_per_page, orderby, desc, keywords) kb_id, page_number, items_per_page, orderby, desc, keywords)

for doc_item in docs:
if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
doc_item['thumbnail'] = f'/v1/document/image/{kb_id}-{doc_item['thumbnail']}'

return get_json_result(data={"total": tol, "docs": docs}) return get_json_result(data={"total": tol, "docs": docs})
except Exception as e: except Exception as e:
return server_error_response(e) return server_error_response(e)

+ 3
- 1
api/contants.py 查看文件

# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.


NAME_LENGTH_LIMIT = 2 ** 10
NAME_LENGTH_LIMIT = 2 ** 10

IMG_BASE64_PREFIX = 'data:image/png;base64,'

+ 10
- 3
api/db/services/file_service.py 查看文件

from api.db.services.document_service import DocumentService from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService from api.db.services.file2document_service import File2DocumentService
from api.utils import get_uuid from api.utils import get_uuid
from api.utils.file_utils import filename_type, thumbnail
from api.utils.file_utils import filename_type, thumbnail_img
from rag.utils.storage_factory import STORAGE_IMPL from rag.utils.storage_factory import STORAGE_IMPL




location += "_" location += "_"
blob = file.read() blob = file.read()
STORAGE_IMPL.put(kb.id, location, blob) STORAGE_IMPL.put(kb.id, location, blob)

doc_id = get_uuid()

img = thumbnail_img(filename, blob)
thumbnail_location = f'thumbnail_{doc_id}.png'
STORAGE_IMPL.put(kb.id, thumbnail_location, img)

doc = { doc = {
"id": get_uuid(),
"id": doc_id,
"kb_id": kb.id, "kb_id": kb.id,
"parser_id": self.get_parser(filetype, filename, kb.parser_id), "parser_id": self.get_parser(filetype, filename, kb.parser_id),
"parser_config": kb.parser_config, "parser_config": kb.parser_config,
"name": filename, "name": filename,
"location": location, "location": location,
"size": len(blob), "size": len(blob),
"thumbnail": thumbnail(filename, blob)
"thumbnail": thumbnail_location
} }
DocumentService.insert(doc) DocumentService.insert(doc)



+ 10
- 8
api/utils/file_utils.py 查看文件

from ruamel.yaml import YAML from ruamel.yaml import YAML


from api.db import FileType from api.db import FileType
from api.contants import IMG_BASE64_PREFIX


PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE") PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
RAG_BASE = os.getenv("RAG_BASE") RAG_BASE = os.getenv("RAG_BASE")


return FileType.OTHER.value return FileType.OTHER.value



def thumbnail(filename, blob):
def thumbnail_img(filename, blob):
filename = filename.lower() filename = filename.lower()
if re.match(r".*\.pdf$", filename): if re.match(r".*\.pdf$", filename):
pdf = pdfplumber.open(BytesIO(blob)) pdf = pdfplumber.open(BytesIO(blob))
buffered = BytesIO() buffered = BytesIO()
pdf.pages[0].to_image(resolution=32).annotated.save(buffered, format="png") pdf.pages[0].to_image(resolution=32).annotated.save(buffered, format="png")
return "data:image/png;base64," + \
base64.b64encode(buffered.getvalue()).decode("utf-8")
return buffered.getvalue()


if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename): if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
image = Image.open(BytesIO(blob)) image = Image.open(BytesIO(blob))
image.thumbnail((30, 30)) image.thumbnail((30, 30))
buffered = BytesIO() buffered = BytesIO()
image.save(buffered, format="png") image.save(buffered, format="png")
return "data:image/png;base64," + \
base64.b64encode(buffered.getvalue()).decode("utf-8")
return buffered.getvalue()


if re.match(r".*\.(ppt|pptx)$", filename): if re.match(r".*\.(ppt|pptx)$", filename):
import aspose.slides as slides import aspose.slides as slides
buffered = BytesIO() buffered = BytesIO()
presentation.slides[0].get_thumbnail(0.03, 0.03).save( presentation.slides[0].get_thumbnail(0.03, 0.03).save(
buffered, drawing.imaging.ImageFormat.png) buffered, drawing.imaging.ImageFormat.png)
return "data:image/png;base64," + \
base64.b64encode(buffered.getvalue()).decode("utf-8")
return buffered.getvalue()
except Exception as e: except Exception as e:
pass pass
return None


def thumbnail(filename, blob):
    """Return the thumbnail of a file as a base64-encoded PNG data URI.

    The raw PNG bytes are produced by ``thumbnail_img`` and then prefixed
    with ``IMG_BASE64_PREFIX``.

    Args:
        filename: Name of the file; passed through to ``thumbnail_img``.
        blob: Raw file content; passed through to ``thumbnail_img``.

    Returns:
        The data-URI string, or ``None`` when ``thumbnail_img`` produced no
        image.
    """
    img = thumbnail_img(filename, blob)
    # thumbnail_img returns None when no thumbnail could be rendered;
    # base64.b64encode(None) would raise TypeError, so bail out early.
    if img is None:
        return None
    return IMG_BASE64_PREFIX + base64.b64encode(img).decode("utf-8")


def traversal_files(base): def traversal_files(base):
for root, ds, fs in os.walk(base): for root, ds, fs in os.walk(base):

Loading…
取消
儲存