Browse Source

fix: markdown_extractor lost chunks if the document starts without a header (#21308) (#21309)

tags/1.5.0
Jin 4 months ago
parent
commit
3e7f8bad56
No account linked to committer's email address

+ 5
- 10
api/core/rag/extractor/markdown_extractor.py View File

@@ -68,22 +68,17 @@ class MarkdownExtractor(BaseExtractor):
continue
header_match = re.match(r"^#+\s", line)
if header_match:
if current_header is not None:
markdown_tups.append((current_header, current_text))

markdown_tups.append((current_header, current_text))
current_header = line
current_text = ""
else:
current_text += line + "\n"
markdown_tups.append((current_header, current_text))

if current_header is not None:
# pass linting, assert keys are defined
markdown_tups = [
(re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) for key, value in markdown_tups
]
else:
markdown_tups = [(key, re.sub("\n", "", value)) for key, value in markdown_tups]
markdown_tups = [
(re.sub(r"#", "", cast(str, key)).strip() if key else None, re.sub(r"<.*?>", "", value))
for key, value in markdown_tups
]

return markdown_tups


+ 22
- 0
api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py View File

@@ -0,0 +1,22 @@
from core.rag.extractor.markdown_extractor import MarkdownExtractor


def test_markdown_to_tups():
    """Check that text preceding the first header is kept as its own chunk.

    The sample document opens with a paragraph that has no header, followed
    by two headed sections. ``markdown_to_tups`` must return three
    ``(header, text)`` tuples: the leading paragraph under a ``None`` header,
    then one tuple per headed section with the ``#`` markers stripped.
    """
    # Headers must sit at column 0 of the literal: the extractor matches
    # them with the anchored pattern ``^#+\s`` on each raw line.
    markdown = """
this is some text without header

# title 1
this is balabala text

## title 2
this is more specific text.
"""
    extractor = MarkdownExtractor(file_path="dummy_path")
    updated_output = extractor.markdown_to_tups(markdown)
    assert len(updated_output) == 3

    # Chunk 0: the header-less preamble must not be dropped.
    preamble_header, preamble_text = updated_output[0]
    assert preamble_header is None
    assert preamble_text.strip() == "this is some text without header"

    # Chunk 1: first headed section.
    title_1, text_1 = updated_output[1]
    assert title_1.strip() == "title 1"
    assert text_1.strip() == "this is balabala text"

    # Chunk 2: the last section is flushed after the loop ends, so verify it
    # explicitly rather than relying on the length check alone.
    title_2, text_2 = updated_output[2]
    assert title_2.strip() == "title 2"
    assert text_2.strip() == "this is more specific text."

Loading…
Cancel
Save