|
|
|
@@ -15,7 +15,7 @@ from rag.nlp import find_codec,num_tokens_from_string |
|
|
|
import re |
|
|
|
|
|
|
|
class RAGFlowTxtParser: |
|
|
|
def __call__(self, fnm, binary=None, chunk_token_num=128): |
|
|
|
def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"): |
|
|
|
txt = "" |
|
|
|
if binary: |
|
|
|
encoding = find_codec(binary) |
|
|
|
@@ -27,7 +27,7 @@ class RAGFlowTxtParser: |
|
|
|
if not l: |
|
|
|
break |
|
|
|
txt += l |
|
|
|
return self.parser_txt(txt, chunk_token_num) |
|
|
|
return self.parser_txt(txt, chunk_token_num, delimiter) |
|
|
|
|
|
|
|
@classmethod |
|
|
|
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"): |