Bladeren bron

fix: document truncation and loss in notion document sync (#5631)

Co-authored-by: Aurelius Huang <cm.huang@aftership.com>
tags/0.6.13
Aurelius Huang 1 jaar geleden
bovenliggende
commit
f546db5437
No account linked to committer's email address
1 gewijzigd bestand met 15 toevoegingen en 16 verwijderingen
  1. 15
    16
      api/core/rag/extractor/notion_extractor.py

+ 15
- 16
api/core/rag/extractor/notion_extractor.py Bestand weergeven



def _get_notion_block_data(self, page_id: str) -> list[str]: def _get_notion_block_data(self, page_id: str) -> list[str]:
result_lines_arr = [] result_lines_arr = []
cur_block_id = page_id
start_cursor = None
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=page_id)
while True: while True:
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
query_dict: dict[str, Any] = {}

query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}
res = requests.request( res = requests.request(
"GET", "GET",
block_url, block_url,
"Content-Type": "application/json", "Content-Type": "application/json",
"Notion-Version": "2022-06-28", "Notion-Version": "2022-06-28",
}, },
json=query_dict
params=query_dict
) )
data = res.json() data = res.json()
for result in data["results"]: for result in data["results"]:
if data["next_cursor"] is None: if data["next_cursor"] is None:
break break
else: else:
cur_block_id = data["next_cursor"]
start_cursor = data["next_cursor"]
return result_lines_arr return result_lines_arr


def _read_block(self, block_id: str, num_tabs: int = 0) -> str: def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
"""Read a block.""" """Read a block."""
result_lines_arr = [] result_lines_arr = []
cur_block_id = block_id
start_cursor = None
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
while True: while True:
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
query_dict: dict[str, Any] = {}
query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}


res = requests.request( res = requests.request(
"GET", "GET",
"Content-Type": "application/json", "Content-Type": "application/json",
"Notion-Version": "2022-06-28", "Notion-Version": "2022-06-28",
}, },
json=query_dict
params=query_dict
) )
data = res.json() data = res.json()
if 'results' not in data or data["results"] is None: if 'results' not in data or data["results"] is None:
if data["next_cursor"] is None: if data["next_cursor"] is None:
break break
else: else:
cur_block_id = data["next_cursor"]
start_cursor = data["next_cursor"]


result_lines = "\n".join(result_lines_arr) result_lines = "\n".join(result_lines_arr)
return result_lines return result_lines
"""Read table rows.""" """Read table rows."""
done = False done = False
result_lines_arr = [] result_lines_arr = []
cur_block_id = block_id
start_cursor = None
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
while not done: while not done:
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
query_dict: dict[str, Any] = {}
query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}


res = requests.request( res = requests.request(
"GET", "GET",
"Content-Type": "application/json", "Content-Type": "application/json",
"Notion-Version": "2022-06-28", "Notion-Version": "2022-06-28",
}, },
json=query_dict
params=query_dict
) )
data = res.json() data = res.json()
# get table headers text # get table headers text
done = True done = True
break break
else: else:
cur_block_id = data["next_cursor"]
start_cursor = data["next_cursor"]


result_lines = "\n".join(result_lines_arr) result_lines = "\n".join(result_lines_arr)
return result_lines return result_lines

Laden…
Annuleren
Opslaan