4 months ago · 3e7f8bad56
--- a/api/core/rag/extractor/markdown_extractor.py
+++ b/api/core/rag/extractor/markdown_extractor.py
@@ -68,22 +68,17 @@ class MarkdownExtractor(BaseExtractor):
                continue
            header_match = re.match(r"^#+\s", line)
            if header_match:
                if current_header is not None:
                    markdown_tups.append((current_header, current_text))

                markdown_tups.append((current_header, current_text))
                current_header = line
                current_text = ""
            else:
                current_text += line + "\n"
        markdown_tups.append((current_header, current_text))

        if current_header is not None:
            # pass linting, assert keys are defined
            markdown_tups = [
                (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) for key, value in markdown_tups
            ]
        else:
            markdown_tups = [(key, re.sub("\n", "", value)) for key, value in markdown_tups]
        markdown_tups = [
            (re.sub(r"#", "", cast(str, key)).strip() if key else None, re.sub(r"<.*?>", "", value))
            for key, value in markdown_tups
        ]

        return markdown_tups

--- a/api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py
+++ b/api/tests/unit_tests/core/rag/extractor/test_markdown_extractor.py
@@ -0,0 +1,22 @@
 from core.rag.extractor.markdown_extractor import MarkdownExtractor


 def test_markdown_to_tups():
    """Check that ``markdown_to_tups`` keeps text that precedes the first header.

    The sample document has one paragraph before any header, followed by two
    headed sections, so three ``(header, text)`` tuples are expected and the
    first one carries ``None`` as its header.
    """
    markdown = """
 this is some text without header

 # title 1
 this is balabala text

 ## title 2
 this is more specific text.
        """
    extractor = MarkdownExtractor(file_path="dummy_path")
    updated_output = extractor.markdown_to_tups(markdown)
    assert len(updated_output) == 3

    # Leading text with no header must be kept, keyed by None.
    key, header_value = updated_output[0]
    assert key is None
    assert header_value.strip() == "this is some text without header"

    # Header keys are compared after stripping surrounding whitespace.
    title_1, value = updated_output[1]
    assert title_1.strip() == "title 1"
    assert value.strip() == "this is balabala text"

    # The last section is flushed after the loop; make sure it is not lost.
    title_2, value_2 = updated_output[2]
    assert title_2.strip() == "title 2"
    assert value_2.strip() == "this is more specific text."