### What problem does this PR solve? feat: Added explanation on the parsing method of knowledge graph #1594 ### Type of change - [x] New Feature (non-breaking change which adds functionality)tags/v0.10.0
| @@ -27,7 +27,7 @@ const ParserListMap = new Map([ | |||
| 'one', | |||
| 'qa', | |||
| 'manual', | |||
| 'knowledge_graph' | |||
| 'knowledge_graph', | |||
| ], | |||
| ], | |||
| [ | |||
| @@ -67,7 +67,7 @@ const ParserListMap = new Map([ | |||
| ], | |||
| [['md'], ['naive', 'qa', 'knowledge_graph']], | |||
| [['json'], ['naive', 'knowledge_graph']], | |||
| [['eml'], ['email']] | |||
| [['eml'], ['email']], | |||
| ]); | |||
| const getParserList = ( | |||
| @@ -199,7 +199,7 @@ export default { | |||
| We assume manual has hierarchical section structure. We use the lowest section titles as pivots to slice documents. | |||
| So, the figures and tables in the same section will not be sliced apart, and chunk size might be large. | |||
| </p>`, | |||
| naive: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT</b>.</p> | |||
| naive: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b>.</p> | |||
| <p>This method apply the naive ways to chunk files: </p> | |||
| <p> | |||
| <li>Successive text will be sliced into pieces using vision detection model.</li> | |||
| @@ -271,6 +271,13 @@ export default { | |||
| </p><p> | |||
| If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method. | |||
| </p>`, | |||
| knowledgeGraph: `<p>Supported file formats are <b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b></p> | |||
| <p>After files are chunked, it uses the chunks to extract a knowledge graph and mind map of the entire document. This method applies the naive way to chunk files: | |||
| Successive text will be sliced into pieces, each of which is around 512 tokens.</p> | |||
| <p>Next, chunks will be transmitted to the LLM to extract nodes and relationships of a knowledge graph, and a mind map.</p> | |||
| Mind the entity type you need to specify.</p>`, | |||
| useRaptor: 'Use RAPTOR to enhance retrieval', | |||
| useRaptorTip: | |||
| 'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059', | |||
| @@ -190,7 +190,7 @@ export default { | |||
| 我們假設手冊具有分層部分結構。我們使用最低的部分標題作為對文檔進行切片的樞軸。 | |||
| 因此,同一部分中的圖和表不會被分割,並且塊大小可能會很大。 | |||
| </p>`, | |||
| naive: `<p>支持的文件格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT</b>。</p> | |||
| naive: `<p>支持的文件格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>。</p> | |||
| <p>此方法將簡單的方法應用於塊文件:</p> | |||
| <p> | |||
| <li>系統將使用視覺檢測模型將連續文本分割成多個片段。</li> | |||
| @@ -244,6 +244,13 @@ export default { | |||
| </p><p> | |||
| 如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。 | |||
| </p>`, | |||
| knowledgeGraph: `<p>支援的檔案格式為<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b> | |||
| <p>文件分塊後,使用分塊擷取整個文件的知識圖譜和心智圖。此方法將簡單的方法應用於區塊檔案: | |||
| 連續的文字將被分割成多個片段,每個片段大約有 512 個令牌數。 | |||
| <p>接下來,區塊將傳送到LLM以提取知識圖譜和思維導圖的節點和關係。 | |||
| <p>請注意您需要指定的條目類型。</p></p>`, | |||
| useRaptor: '使用RAPTOR文件增強策略', | |||
| useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059', | |||
| prompt: '提示詞', | |||
| @@ -191,7 +191,7 @@ export default { | |||
| 我们假设手册具有分层部分结构。 我们使用最低的部分标题作为对文档进行切片的枢轴。 | |||
| 因此,同一部分中的图和表不会被分割,并且块大小可能会很大。 | |||
| </p>`, | |||
| naive: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT</b>。</p> | |||
| naive: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b>。</p> | |||
| <p>此方法将简单的方法应用于块文件:</p> | |||
| <p> | |||
| <li>系统将使用视觉检测模型将连续文本分割成多个片段。</li> | |||
| @@ -261,6 +261,13 @@ export default { | |||
| </p><p> | |||
| 如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。 | |||
| </p>`, | |||
| knowledgeGraph: `<p>支持的文件格式为<b>DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML</b> | |||
| <p>文件分块后,使用分块提取整个文档的知识图谱和思维导图。此方法将简单的方法应用于分块文件: | |||
| 连续的文本将被切成大约 512 个 token 数的块。</p> | |||
| <p>接下来,将分块传输到 LLM 以提取知识图谱和思维导图的节点和关系。</p> | |||
| 注意您需要指定的条目类型。</p>`, | |||
| useRaptor: '使用召回增强RAPTOR策略', | |||
| useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059', | |||
| prompt: '提示词', | |||
| @@ -3,6 +3,7 @@ import { useTranslate } from '@/hooks/common-hooks'; | |||
| import { useSelectParserList } from '@/hooks/user-setting-hooks'; | |||
| import { Col, Divider, Empty, Row, Typography } from 'antd'; | |||
| import DOMPurify from 'dompurify'; | |||
| import camelCase from 'lodash/camelCase'; | |||
| import { useMemo } from 'react'; | |||
| import styles from './index.less'; | |||
| import { ImageMap } from './utils'; | |||
| @@ -18,7 +19,7 @@ const CategoryPanel = ({ chunkMethod }: { chunkMethod: string }) => { | |||
| if (item) { | |||
| return { | |||
| title: item.label, | |||
| description: t(item.value), | |||
| description: t(camelCase(item.value)), | |||
| }; | |||
| } | |||
| return { title: '', description: '' }; | |||
| @@ -37,6 +37,9 @@ export const useSubmitKnowledgeConfiguration = (form: FormInstance) => { | |||
| }; | |||
| }; | |||
| // The value that does not need to be displayed in the analysis method Select | |||
| const HiddenFields = ['email', 'picture', 'audio']; | |||
| export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => { | |||
| const parserList = useSelectParserList(); | |||
| const allOptions = useSelectLlmOptionsByModelType(); | |||
| @@ -62,7 +65,9 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => { | |||
| }, [form, knowledgeDetails]); | |||
| return { | |||
| parserList, | |||
| parserList: parserList.filter( | |||
| (x) => !HiddenFields.some((y) => y === x.value), | |||
| ), | |||
| embeddingModelOptions: allOptions[LlmModelType.Embedding], | |||
| disabled: knowledgeDetails.chunk_num > 0, | |||
| }; | |||
| @@ -15,6 +15,7 @@ export const ImageMap = { | |||
| resume: getImageName('resume', 2), | |||
| table: getImageName('table', 2), | |||
| one: getImageName('one', 2), | |||
| knowledge_graph: getImageName('knowledge-graph', 2), | |||
| }; | |||
| export const TextMap = { | |||