### What problem does this PR solve?

feat: Added explanation on the parsing method of knowledge graph #1594

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
  'one',
  'qa',
  'manual',
- 'knowledge_graph'
+ 'knowledge_graph',
  ],
  ],
  [
  ],
  [['md'], ['naive', 'qa', 'knowledge_graph']],
  [['json'], ['naive', 'knowledge_graph']],
- [['eml'], ['email']]
+ [['eml'], ['email']],
  ]);

  const getParserList = (
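The map above pairs groups of file extensions with the chunk methods that support them; `md` and `json`, for instance, include `knowledge_graph`, and `eml` maps to `email`. A minimal sketch of how such a map might be queried for one extension (the map name, the helper, and the trimmed entry list are assumptions for illustration, not the repository's exact code):

```ts
// Hypothetical, trimmed-down version of the extension → chunk-method map;
// only the entries visible in the diff are reproduced here.
const ParserListMap = new Map<string[], string[]>([
  [['md'], ['naive', 'qa', 'knowledge_graph']],
  [['json'], ['naive', 'knowledge_graph']],
  [['eml'], ['email']],
]);

// Illustrative lookup: return the chunk methods whose extension group
// contains the given file extension.
const getParsersByExtension = (extension: string): string[] => {
  for (const [extensions, parsers] of ParserListMap) {
    if (extensions.includes(extension)) {
      return parsers;
    }
  }
  return [];
};

// getParsersByExtension('json') → ['naive', 'knowledge_graph']
```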
  We assume manual has hierarchical section structure. We use the lowest section titles as pivots to slice documents.
  So, the figures and tables in the same section will not be sliced apart, and chunk size might be large.
  </p>`,
- naive: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT</b>.</p>
+ naive: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b>.</p>
  <p>This method applies naive ways to chunk files:</p>
  <p>
  <li>Successive text will be sliced into pieces using a vision detection model.</li>
  </p><p>
  If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
  </p>`,
+ knowledgeGraph: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b>.</p>
+ <p>After files are chunked, the chunks are used to extract a knowledge graph and a mind map of the entire document. This method applies naive chunking:
+ successive text is sliced into pieces of roughly 512 tokens each.</p>
+ <p>Next, the chunks are transmitted to the LLM to extract the nodes and relationships of a knowledge graph, as well as a mind map.</p>
+ <p>Mind the entity types you need to specify.</p>`,
  useRaptor: 'Use RAPTOR to enhance retrieval',
  useRaptorTip:
    'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
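The new `knowledgeGraph` copy summarizes the pipeline: chunk the document naively into pieces of roughly 512 tokens, then send each chunk to the LLM to extract entities of the user-specified types and the relationships between them, plus a mind map. A rough TypeScript sketch of that flow, using purely hypothetical names rather than RAGFlow's actual backend API:

```ts
// Illustrative only: hypothetical types and helpers, not RAGFlow's backend API.
interface GraphNode {
  name: string;
  type: string; // one of the entity types the user specifies
}
interface GraphEdge {
  source: string;
  target: string;
  relation: string;
}

// Naive chunking: slice successive text into pieces of roughly `maxTokens`
// tokens; tokens are approximated by whitespace-separated words here.
function naiveChunk(text: string, maxTokens = 512): string[] {
  const words = text.split(/\s+/).filter(Boolean);
  const chunks: string[] = [];
  for (let i = 0; i < words.length; i += maxTokens) {
    chunks.push(words.slice(i, i + maxTokens).join(' '));
  }
  return chunks;
}

// Each chunk is sent to the LLM, which returns the entities (restricted to the
// requested types) and the relationships it finds; results are merged into one
// graph. Mind-map extraction is omitted for brevity.
async function extractKnowledgeGraph(
  text: string,
  entityTypes: string[],
  callLLM: (prompt: string) => Promise<{ nodes: GraphNode[]; edges: GraphEdge[] }>,
): Promise<{ nodes: GraphNode[]; edges: GraphEdge[] }> {
  const nodes: GraphNode[] = [];
  const edges: GraphEdge[] = [];
  for (const chunk of naiveChunk(text)) {
    const result = await callLLM(
      `Extract entities of types [${entityTypes.join(', ')}] and their relationships:\n${chunk}`,
    );
    nodes.push(...result.nodes);
    edges.push(...result.edges);
  }
  return { nodes, edges };
}
```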
  我們假設手冊具有分層部分結構。我們使用最低的部分標題作為對文檔進行切片的樞軸。
  因此，同一部分中的圖和表不會被分割，並且塊大小可能會很大。
  </p>`,
- naive: `<p>支持的文件格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT</b>。</p>
+ naive: `<p>支持的文件格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>。</p>
  <p>此方法將簡單的方法應用於塊文件：</p>
  <p>
  <li>系統將使用視覺檢測模型將連續文本分割成多個片段。</li>
  </p><p>
  如果你要總結的東西需要一篇文章的全部上下文，並且所選LLM的上下文長度覆蓋了文檔長度，你可以嘗試這種方法。
  </p>`,
+ knowledgeGraph: `<p>支援的檔案格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>。</p>
+ <p>文件分塊後，使用分塊擷取整個文件的知識圖譜和心智圖。此方法將簡單的方法應用於區塊檔案：
+ 連續的文字將被分割成多個片段，每個片段大約有 512 個令牌數。</p>
+ <p>接下來，區塊將傳送到LLM以提取知識圖譜和思維導圖的節點和關係。</p>
+ <p>請注意您需要指定的條目類型。</p>`,
  useRaptor: '使用RAPTOR文件增強策略',
  useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
  prompt: '提示詞',
  我们假设手册具有分层部分结构。我们使用最低的部分标题作为对文档进行切片的枢轴。
  因此，同一部分中的图和表不会被分割，并且块大小可能会很大。
  </p>`,
- naive: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT</b>。</p>
+ naive: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>。</p>
  <p>此方法将简单的方法应用于块文件：</p>
  <p>
  <li>系统将使用视觉检测模型将连续文本分割成多个片段。</li>
  </p><p>
  如果你要总结的东西需要一篇文章的全部上下文，并且所选LLM的上下文长度覆盖了文档长度，你可以尝试这种方法。
  </p>`,
+ knowledgeGraph: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>。</p>
+ <p>文件分块后，使用分块提取整个文档的知识图谱和思维导图。此方法将简单的方法应用于分块文件：
+ 连续的文本将被切成大约 512 个 token 数的块。</p>
+ <p>接下来，将分块传输到 LLM 以提取知识图谱和思维导图的节点和关系。</p>
+ <p>注意您需要指定的条目类型。</p>`,
  useRaptor: '使用召回增强RAPTOR策略',
  useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
  prompt: '提示词',
  import { useSelectParserList } from '@/hooks/user-setting-hooks';
  import { Col, Divider, Empty, Row, Typography } from 'antd';
  import DOMPurify from 'dompurify';
+ import camelCase from 'lodash/camelCase';
  import { useMemo } from 'react';
  import styles from './index.less';
  import { ImageMap } from './utils';

  if (item) {
    return {
      title: item.label,
-     description: t(item.value),
+     description: t(camelCase(item.value)),
    };
  }
  return { title: '', description: '' };
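The `camelCase` conversion is what makes the new copy resolve: the parser value stored on each item is snake_case (e.g. `knowledge_graph`), while the locale keys added above are camelCase (`knowledgeGraph`), so the value is converted before being passed to `t()`. For example:

```ts
import camelCase from 'lodash/camelCase';

camelCase('knowledge_graph'); // 'knowledgeGraph' — matches the new locale key
camelCase('naive');           // 'naive' — values without underscores are unaffected
```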
  };
  };

+ // Parser values that should not be displayed in the parsing-method Select
+ const HiddenFields = ['email', 'picture', 'audio'];

  export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
    const parserList = useSelectParserList();
    const allOptions = useSelectLlmOptionsByModelType();
  }, [form, knowledgeDetails]);

  return {
-   parserList,
+   parserList: parserList.filter(
+     (x) => !HiddenFields.some((y) => y === x.value),
+   ),
    embeddingModelOptions: allOptions[LlmModelType.Embedding],
    disabled: knowledgeDetails.chunk_num > 0,
  };
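The `HiddenFields` filter keeps the `email`, `picture`, and `audio` parsers out of the parsing-method Select while leaving the other options untouched. A small illustration (labels are made up; only `value` matters to the filter):

```ts
const HiddenFields = ['email', 'picture', 'audio'];

const parserList = [
  { label: 'General', value: 'naive' },
  { label: 'Knowledge Graph', value: 'knowledge_graph' },
  { label: 'Email', value: 'email' },
];

const visible = parserList.filter(
  (x) => !HiddenFields.some((y) => y === x.value),
);
// visible → [{ label: 'General', ... }, { label: 'Knowledge Graph', ... }]
```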
  resume: getImageName('resume', 2),
  table: getImageName('table', 2),
  one: getImageName('one', 2),
+ knowledge_graph: getImageName('knowledge-graph', 2),
  };

  export const TextMap = {