
Feat: Markdown add image (#7124)

### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/6984

1. The Markdown parser now supports extracting pictures.
2. The Naive chunking method handles images when it processes Markdown files.
3. Improve merging and chunk tokenization so that section images are carried into the resulting chunks (a minimal sketch follows).
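How the extraction step works, as a minimal sketch (illustrative names only; the committed helper is `Markdown.get_picture_urls` in `rag/app/naive.py`): the Markdown source is rendered to HTML and the `<img src>` attributes are collected.

```python
# Illustrative sketch, not the committed code: Markdown text -> HTML -> image URLs.
import markdown
from bs4 import BeautifulSoup

def extract_image_urls(md_text: str) -> list[str]:
    html = markdown.Markdown().convert(md_text)   # render Markdown to HTML
    soup = BeautifulSoup(html, "html.parser")
    return [img.get("src") for img in soup.find_all("img") if img.get("src")]

print(extract_image_urls("![logo](https://example.com/logo.png)"))
# ['https://example.com/logo.png']
```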

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
tags/v0.19.0
Stephen Hu, 6 months ago
parent commit 1662c7eda3

rag/app/naive.py (+77, -12)



from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
-from markdown import markdown
+import markdown
from PIL import Image
from tika import parser

from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
-from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
+from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
from rag.utils import num_tokens_from_string








class Markdown(MarkdownParser):
+    def get_picture_urls(self, sections):
+        if not sections:
+            return []
+        if isinstance(sections, type("")):
+            text = sections
+        elif isinstance(sections[0], type("")):
+            text = sections[0]
+        else:
+            return []
+        from bs4 import BeautifulSoup
+        md = markdown.Markdown()
+        html_content = md.convert(text)
+        soup = BeautifulSoup(html_content, 'html.parser')
+        html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
+        return html_images
+
+    def get_pictures(self, text):
+        """Download and open all images from markdown text."""
+        import requests
+        image_urls = self.get_picture_urls(text)
+        images = []
+        # Find all image URLs in text
+        for url in image_urls:
+            try:
+                response = requests.get(url, stream=True, timeout=30)
+                if response.status_code == 200 and response.headers['Content-Type'].startswith('image/'):
+                    img = Image.open(BytesIO(response.content)).convert('RGB')
+                    images.append(img)
+            except Exception as e:
+                logging.error(f"Failed to download/open image from {url}: {e}")
+                continue
+        return images if images else None

    def __call__(self, filename, binary=None):
        if binary:
            encoding = find_codec(binary)

    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    res = []
    pdf_parser = None
+    section_images = None
    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")


        if kwargs.get("section_only", False):
            return chunks

-        res.extend(tokenize_chunks_docx(chunks, doc, is_english, images))
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
        logging.info("naive_merge({}): {}".format(filename, timer() - st))
        return res




    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
-        sections, tables = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
+        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
+        sections, tables = markdown_parser(filename, binary)
+        # Process images for each section
+        section_images = []
+        for section_text, _ in sections:
+            images = markdown_parser.get_pictures(section_text) if section_text else None
+            if images:
+                # If multiple images found, combine them using concat_img
+                combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
+                section_images.append(combined_image)
+            else:
+                section_images.append(None)
        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")


"file type not supported yet(pdf, xlsx, doc, docx, txt supported)") "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")


    st = timer()
-    chunks = naive_merge(
-        sections, int(parser_config.get(
-            "chunk_token_num", 128)), parser_config.get(
-            "delimiter", "\n!?。;!?"))
-    if kwargs.get("section_only", False):
-        return chunks

-    res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+    if section_images:
+        # if all images are None, set section_images to None
+        if all(image is None for image in section_images):
+            section_images = None

+    if section_images:
+        chunks, images = naive_merge_with_images(sections, section_images,
+                                                 int(parser_config.get(
+                                                     "chunk_token_num", 128)), parser_config.get(
+                                                     "delimiter", "\n!?。;!?"))
+        if kwargs.get("section_only", False):
+            return chunks
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+    else:
+        chunks = naive_merge(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。;!?"))
+        if kwargs.get("section_only", False):
+            return chunks

+        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
    logging.info("naive_merge({}): {}".format(filename, timer() - st))
    return res
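When a single Markdown section yields several images, the code above folds them into one with `reduce(concat_img, images)`. A hedged sketch of that pairwise fold, where `stack_vertically` is an illustrative stand-in for `rag.nlp.concat_img` (whose exact behavior is defined elsewhere in the repo):

```python
# Sketch only: combine a section's images pairwise into a single image.
from functools import reduce
from PIL import Image

def stack_vertically(a, b):
    """Illustrative stand-in for rag.nlp.concat_img: paste b below a."""
    if a is None:
        return b
    if b is None:
        return a
    out = Image.new("RGB", (max(a.width, b.width), a.height + b.height), "white")
    out.paste(a, (0, 0))
    out.paste(b, (0, a.height))
    return out

images = [Image.new("RGB", (120, 40), "red"), Image.new("RGB", (80, 60), "blue")]
combined = reduce(stack_vertically, images) if len(images) > 1 else images[0]
print(combined.size)  # (120, 100)
```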



rag/nlp/__init__.py (+40, -3)

        res.append(d)
    return res



-def tokenize_chunks_docx(chunks, doc, eng, images):
+def tokenize_chunks_with_images(chunks, doc, eng, images):
    res = []
    # wrap up as es documents
    for ck, image in zip(chunks, images):
        res.append(d)
    return res



def tokenize_table(tbls, doc, eng, batch_size=10):
    res = []
    # add tables

        add_chunk(sec, pos)


    return cks

+def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
+    if not texts or len(texts) != len(images):
+        return [], []
+    # Ensure texts is a list of str, not tuples; if tuples, take the first item
+    if isinstance(texts[0], tuple):
+        texts = [t[0] for t in texts]
+    cks = [""]
+    result_images = [None]
+    tk_nums = [0]

+    def add_chunk(t, image, pos=""):
+        nonlocal cks, result_images, tk_nums, delimiter
+        tnum = num_tokens_from_string(t)
+        if not pos:
+            pos = ""
+        if tnum < 8:
+            pos = ""
+        # Ensure that the length of the merged chunk does not exceed chunk_token_num
+        if tk_nums[-1] > chunk_token_num:
+            if t.find(pos) < 0:
+                t += pos
+            cks.append(t)
+            result_images.append(image)
+            tk_nums.append(tnum)
+        else:
+            if cks[-1].find(pos) < 0:
+                t += pos
+            cks[-1] += t
+            if result_images[-1] is None:
+                result_images[-1] = image
+            else:
+                result_images[-1] = concat_img(result_images[-1], image)
+            tk_nums[-1] += tnum

+    for text, image in zip(texts, images):
+        add_chunk(text, image)

+    return cks, result_images


def docx_question_level(p, bull=-1):
    txt = re.sub(r"\u3000", " ", p.text).strip()
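A minimal usage sketch of the new `naive_merge_with_images`, assuming a ragflow checkout is importable; the section list and the image list must have the same length, and the returned chunk list and image list stay index-aligned:

```python
# Usage sketch (assumes the rag package is on PYTHONPATH); `figure` stands in for
# an image downloaded by Markdown.get_pictures.
from PIL import Image
from rag.nlp import naive_merge_with_images

sections = ["Intro text ...", "More prose ...", "A section that embeds a figure ..."]
figure = Image.new("RGB", (64, 64), "gray")
images = [None, None, figure]   # one entry per section, None when it has no image

chunks, chunk_images = naive_merge_with_images(
    sections, images, chunk_token_num=128, delimiter="\n。;!?")
for ck, img in zip(chunks, chunk_images):
    print(len(ck), "chars,", "with image" if img is not None else "no image")
```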

web/src/interfaces/database/knowledge.ts (+1, -1)

  content_with_weight: string;
  doc_id: string;
  doc_name: string;
-  img_id: string;
+  image_id: string;
  important_kwd?: string[];
  question_kwd?: string[]; // keywords
  tag_kwd?: string[];

web/src/pages/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx (+3, -3)

    >
      <Flex gap={'middle'} justify={'space-between'}>
        <Checkbox onChange={handleCheck} checked={checked}></Checkbox>
-        {item.img_id && (
+        {item.image_id && (
          <Popover
            placement="right"
            content={
-              <Image id={item.img_id} className={styles.imagePreview}></Image>
+              <Image id={item.image_id} className={styles.imagePreview}></Image>
            }
          >
-            <Image id={item.img_id} className={styles.image}></Image>
+            <Image id={item.image_id} className={styles.image}></Image>
          </Popover>
        )}


