Browse Source

Fix: resolve regex library warnings (#7782)

### What problem does this PR solve?
This small PR passes `count` and `flags` to `re.sub` as keyword arguments, which resolves the following `re` module warning shown under Python 3.11:
```python
DeprecationWarning: 'count' is passed as positional argument
```

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
tags/v0.19.x
Emmanuel Ferdman 5 months ago
parent
commit
d4a123d6dd
No account linked to committer's email address

+ 3
- 3
deepdoc/parser/resume/entities/corporations.py View File

nm = re.sub(r"&amp;", "&", nm) nm = re.sub(r"&amp;", "&", nm)
nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm) nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
nm = re.sub( nm = re.sub(
r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE
r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, count=10000, flags=re.IGNORECASE
) )
nm = re.sub( nm = re.sub(
r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
"", "",
nm, nm,
10000,
re.IGNORECASE,
count=10000,
flags=re.IGNORECASE,
) )
if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])): if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
return nm return nm

+ 2
- 2
deepdoc/parser/resume/step_two.py View File





def rmHtmlTag(line): def rmHtmlTag(line):
return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, 100000, re.IGNORECASE)
return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, count=100000, flags=re.IGNORECASE)




def highest_degree(dg): def highest_degree(dg):
(r".*国有.*", "国企"), (r".*国有.*", "国企"),
(r"[ ()\(\)人/·0-9-]+", ""), (r"[ ()\(\)人/·0-9-]+", ""),
(r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]: (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], count=1000, flags=re.IGNORECASE)
if len(cv["corporation_type"]) < 2: if len(cv["corporation_type"]) < 2:
del cv["corporation_type"] del cv["corporation_type"]



+ 1
- 1
rag/nlp/__init__.py View File

type("")) else sections[i][0]).strip() type("")) else sections[i][0]).strip()


if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], flags=re.IGNORECASE)):
i += 1 i += 1
continue continue
sections.pop(i) sections.pop(i)

Loading…
Cancel
Save