| from unstructured.partition.ppt import partition_ppt | from unstructured.partition.ppt import partition_ppt | ||||
| elements = partition_ppt(filename=self._file_path, api_url=self._api_url) | elements = partition_ppt(filename=self._file_path, api_url=self._api_url) | ||||
| from unstructured.chunking.title import chunk_by_title | |||||
| chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) | |||||
| text_by_page = {} | |||||
| for element in elements: | |||||
| page = element.metadata.page_number | |||||
| text = element.text | |||||
| if page in text_by_page: | |||||
| text_by_page[page] += "\n" + text | |||||
| else: | |||||
| text_by_page[page] = text | |||||
| combined_texts = list(text_by_page.values()) | |||||
| documents = [] | documents = [] | ||||
| for chunk in chunks: | |||||
| text = chunk.text.strip() | |||||
| for combined_text in combined_texts: | |||||
| text = combined_text.strip() | |||||
| documents.append(Document(page_content=text)) | documents.append(Document(page_content=text)) | ||||
| return documents | return documents |
| from unstructured.partition.pptx import partition_pptx | from unstructured.partition.pptx import partition_pptx | ||||
| elements = partition_pptx(filename=self._file_path, api_url=self._api_url) | elements = partition_pptx(filename=self._file_path, api_url=self._api_url) | ||||
| from unstructured.chunking.title import chunk_by_title | |||||
| chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) | |||||
| text_by_page = {} | |||||
| for element in elements: | |||||
| page = element.metadata.page_number | |||||
| text = element.text | |||||
| if page in text_by_page: | |||||
| text_by_page[page] += "\n" + text | |||||
| else: | |||||
| text_by_page[page] = text | |||||
| combined_texts = list(text_by_page.values()) | |||||
| documents = [] | documents = [] | ||||
| for chunk in chunks: | |||||
| text = chunk.text.strip() | |||||
| for combined_text in combined_texts: | |||||
| text = combined_text.strip() | |||||
| documents.append(Document(page_content=text)) | documents.append(Document(page_content=text)) | ||||
| return documents | return documents |
| hash = helper.generate_text_hash(document_node.page_content) | hash = helper.generate_text_hash(document_node.page_content) | ||||
| document_node.metadata['doc_id'] = doc_id | document_node.metadata['doc_id'] = doc_id | ||||
| document_node.metadata['doc_hash'] = hash | document_node.metadata['doc_hash'] = hash | ||||
| # delete leading splitter character | |||||
| page_content = document_node.page_content | |||||
| if page_content.startswith(".") or page_content.startswith("。"): | |||||
| page_content = page_content[1:] | |||||
| else: | |||||
| page_content = page_content | |||||
| document_node.page_content = page_content | |||||
| split_documents.append(document_node) | split_documents.append(document_node) | ||||
| all_documents.extend(split_documents) | all_documents.extend(split_documents) | ||||
| # processing qa document | # processing qa document |