* init README of deepdoc, add picture processer. * add resume parsingtags/v0.1.0
| hudet/ | hudet/ | ||||
| cv/ | cv/ | ||||
| layout_app.py | layout_app.py | ||||
| resume/ | |||||
| api/flask_session | |||||
| # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries | ||||
| # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html |
| del req["conversation_id"] | del req["conversation_id"] | ||||
| del req["messages"] | del req["messages"] | ||||
| ans = chat(dia, msg, **req) | ans = chat(dia, msg, **req) | ||||
| if not conv.reference: conv.reference = [] | |||||
| conv.reference.append(ans["reference"]) | conv.reference.append(ans["reference"]) | ||||
| conv.message.append({"role": "assistant", "content": ans["answer"]}) | conv.message.append({"role": "assistant", "content": ans["answer"]}) | ||||
| ConversationService.update_by_id(conv.id, conv.to_dict()) | ConversationService.update_by_id(conv.id, conv.to_dict()) |
| dialog_id = req.get("dialog_id") | dialog_id = req.get("dialog_id") | ||||
| name = req.get("name", "New Dialog") | name = req.get("name", "New Dialog") | ||||
| description = req.get("description", "A helpful Dialog") | description = req.get("description", "A helpful Dialog") | ||||
| language = req.get("language", "Chinese") | |||||
| top_n = req.get("top_n", 6) | top_n = req.get("top_n", 6) | ||||
| similarity_threshold = req.get("similarity_threshold", 0.1) | similarity_threshold = req.get("similarity_threshold", 0.1) | ||||
| vector_similarity_weight = req.get("vector_similarity_weight", 0.3) | vector_similarity_weight = req.get("vector_similarity_weight", 0.3) | ||||
| "name": name, | "name": name, | ||||
| "kb_ids": req["kb_ids"], | "kb_ids": req["kb_ids"], | ||||
| "description": description, | "description": description, | ||||
| "language": language, | |||||
| "llm_id": llm_id, | "llm_id": llm_id, | ||||
| "llm_setting": llm_setting, | "llm_setting": llm_setting, | ||||
| "prompt_config": prompt_config, | "prompt_config": prompt_config, |
| response = flask.make_response(MINIO.get(doc.kb_id, doc.location)) | response = flask.make_response(MINIO.get(doc.kb_id, doc.location)) | ||||
| ext = re.search(r"\.([^.]+)$", doc.name) | ext = re.search(r"\.([^.]+)$", doc.name) | ||||
| if ext: | if ext: | ||||
| response.headers.set('Content-Type', 'application/%s'%ext.group(1)) | |||||
| if doc.type == FileType.VISUAL.value: | |||||
| response.headers.set('Content-Type', 'image/%s'%ext.group(1)) | |||||
| else: response.headers.set('Content-Type', 'application/%s'%ext.group(1)) | |||||
| return response | return response | ||||
| except Exception as e: | except Exception as e: | ||||
| return server_error_response(e) | return server_error_response(e) |
| avatar = TextField(null=True, help_text="avatar base64 string") | avatar = TextField(null=True, help_text="avatar base64 string") | ||||
| tenant_id = CharField(max_length=32, null=False) | tenant_id = CharField(max_length=32, null=False) | ||||
| name = CharField(max_length=128, null=False, help_text="KB name", index=True) | name = CharField(max_length=128, null=False, help_text="KB name", index=True) | ||||
| language = CharField(max_length=32, null=True, default="Chinese", help_text="English|Chinese") | |||||
| description = TextField(null=True, help_text="KB description") | description = TextField(null=True, help_text="KB description") | ||||
| embd_id = CharField(max_length=128, null=False, help_text="default embedding model ID") | embd_id = CharField(max_length=128, null=False, help_text="default embedding model ID") | ||||
| permission = CharField(max_length=16, null=False, help_text="me|team", default="me") | permission = CharField(max_length=16, null=False, help_text="me|team", default="me") |
| @classmethod | @classmethod | ||||
| @DB.connection_context() | @DB.connection_context() | ||||
| def model_instance(cls, tenant_id, llm_type, llm_name=None): | |||||
| def model_instance(cls, tenant_id, llm_type, llm_name=None, lang="Chinese"): | |||||
| e, tenant = TenantService.get_by_id(tenant_id) | e, tenant = TenantService.get_by_id(tenant_id) | ||||
| if not e: | if not e: | ||||
| raise LookupError("Tenant not found") | raise LookupError("Tenant not found") | ||||
| if model_config["llm_factory"] not in CvModel: | if model_config["llm_factory"] not in CvModel: | ||||
| return | return | ||||
| return CvModel[model_config["llm_factory"]]( | return CvModel[model_config["llm_factory"]]( | ||||
| model_config["api_key"], model_config["llm_name"]) | |||||
| model_config["api_key"], model_config["llm_name"], lang) | |||||
| if llm_type == LLMType.CHAT.value: | if llm_type == LLMType.CHAT.value: | ||||
| if model_config["llm_factory"] not in ChatModel: | if model_config["llm_factory"] not in ChatModel: | ||||
| class LLMBundle(object): | class LLMBundle(object): | ||||
| def __init__(self, tenant_id, llm_type, llm_name=None): | |||||
| def __init__(self, tenant_id, llm_type, llm_name=None, lang="Chinese"): | |||||
| self.tenant_id = tenant_id | self.tenant_id = tenant_id | ||||
| self.llm_type = llm_type | self.llm_type = llm_type | ||||
| self.llm_name = llm_name | self.llm_name = llm_name | ||||
| self.mdl = TenantLLMService.model_instance(tenant_id, llm_type, llm_name) | |||||
| self.mdl = TenantLLMService.model_instance(tenant_id, llm_type, llm_name, lang=lang) | |||||
| assert self.mdl, "Can't find mole for {}/{}/{}".format(tenant_id, llm_type, llm_name) | assert self.mdl, "Can't find mole for {}/{}/{}".format(tenant_id, llm_type, llm_name) | ||||
| def encode(self, texts: list, batch_size=32): | def encode(self, texts: list, batch_size=32): |
| @classmethod | @classmethod | ||||
| @DB.connection_context() | @DB.connection_context() | ||||
| def get_tasks(cls, tm, mod=0, comm=1, items_per_page=64): | def get_tasks(cls, tm, mod=0, comm=1, items_per_page=64): | ||||
| fields = [cls.model.id, cls.model.doc_id, cls.model.from_page,cls.model.to_page, Document.kb_id, Document.parser_id, Document.parser_config, Document.name, Document.type, Document.location, Document.size, Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, Tenant.asr_id, cls.model.update_time] | |||||
| fields = [ | |||||
| cls.model.id, | |||||
| cls.model.doc_id, | |||||
| cls.model.from_page, | |||||
| cls.model.to_page, | |||||
| Document.kb_id, | |||||
| Document.parser_id, | |||||
| Document.parser_config, | |||||
| Document.name, | |||||
| Document.type, | |||||
| Document.location, | |||||
| Document.size, | |||||
| Knowledgebase.tenant_id, | |||||
| Knowledgebase.language, | |||||
| Tenant.embd_id, | |||||
| Tenant.img2txt_id, | |||||
| Tenant.asr_id, | |||||
| cls.model.update_time] | |||||
| docs = cls.model.select(*fields) \ | docs = cls.model.select(*fields) \ | ||||
| .join(Document, on=(cls.model.doc_id == Document.id)) \ | .join(Document, on=(cls.model.doc_id == Document.id)) \ | ||||
| .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \ | .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \ | ||||
| .paginate(1, items_per_page) | .paginate(1, items_per_page) | ||||
| return list(docs.dicts()) | return list(docs.dicts()) | ||||
| @classmethod | @classmethod | ||||
| @DB.connection_context() | @DB.connection_context() | ||||
| def do_cancel(cls, id): | def do_cancel(cls, id): | ||||
| pass | pass | ||||
| return True | return True | ||||
| @classmethod | @classmethod | ||||
| @DB.connection_context() | @DB.connection_context() | ||||
| def update_progress(cls, id, info): | def update_progress(cls, id, info): | ||||
| cls.model.update(progress_msg=cls.model.progress_msg + "\n"+info["progress_msg"]).where( | |||||
| cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where( | |||||
| cls.model.id == id).execute() | cls.model.id == id).execute() | ||||
| if "progress" in info: | if "progress" in info: | ||||
| cls.model.update(progress=info["progress"]).where( | cls.model.update(progress=info["progress"]).where( | ||||
| cls.model.id == id).execute() | |||||
| cls.model.id == id).execute() |
| return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8") | return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8") | ||||
| if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename): | if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename): | ||||
| return ("data:image/%s;base64,"%filename.split(".")[-1]) + base64.b64encode(Image.open(BytesIO(blob)).thumbnail((30, 30)).tobytes()).decode("utf-8") | |||||
| image = Image.open(BytesIO(blob)) | |||||
| image.thumbnail((30, 30)) | |||||
| buffered = BytesIO() | |||||
| image.save(buffered, format="png") | |||||
| return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8") | |||||
| if re.match(r".*\.(ppt|pptx)$", filename): | if re.match(r".*\.(ppt|pptx)$", filename): | ||||
| import aspose.slides as slides | import aspose.slides as slides |
| English | [简体中文](./README_zh.md) | |||||
| # *Deep*Doc | |||||
| --- | |||||
| - [1. Introduction](#1) | |||||
| - [2. Vision](#2) | |||||
| - [3. Parser](#3) | |||||
| <a name="1"></a> | |||||
| ## 1. Introduction | |||||
| --- | |||||
| Documents come from various domains, in various formats, and with diverse retrieval requirements, | |||||
| so analyzing them accurately is a very challenging task. *Deep*Doc was born for that purpose. | |||||
| There are 2 parts in *Deep*Doc so far: vision and parser. | |||||
| <a name="2"></a> | |||||
| ## 2. Vision | |||||
| --- | |||||
| We use visual information to solve problems, the way a human being does. | |||||
| - OCR. Since a lot of documents are presented as images, or can at least be transformed into images, | |||||
| OCR is an essential, fundamental and even universal solution for text extraction. | |||||
| <div align="center" style="margin-top:20px;margin-bottom:20px;"> | |||||
| <img src="https://lh6.googleusercontent.com/2xdiSjaGWkZ71YdORc71Ujf7jCHmO6G-6ONklzGiUYEh3QZpjPo6MQ9eqEFX20am_cdW4Ck0YRraXEetXWnM08kJd99yhik13Cy0_YKUAq2zVGR15LzkovRAmK9iT4o3hcJ8dTpspaJKUwt6R4gN7So" width="300"/> | |||||
| </div> | |||||
| - Layout recognition. Documents from different domains may have various layouts: | |||||
| newspapers, magazines, books and résumés are all distinct in terms of layout. | |||||
| Only when the machine has an accurate layout analysis can it decide whether text parts are successive or not, | |||||
| whether a part needs Table Structure Recognition (TSR) to process, or whether a part is a figure described by a caption. | |||||
| We have 10 basic layout components which cover most cases: | |||||
| - Text | |||||
| - Title | |||||
| - Figure | |||||
| - Figure caption | |||||
| - Table | |||||
| - Table caption | |||||
| - Header | |||||
| - Footer | |||||
| - Reference | |||||
| - Equation | |||||
| <div align="center" style="margin-top:20px;margin-bottom:20px;"> | |||||
| <img src="https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/ppstructure/docs/layout/layout.png?raw=true" width="900"/> | |||||
| </div> | |||||
| - Table Structure Recognition (TSR). A data table is a frequently used structure for presenting data, including numbers or text. | |||||
| The structure of a table can be very complex, with hierarchical headers, spanning cells and projected row headers. | |||||
| Along with TSR, we also reassemble the content into sentences which can be well comprehended by an LLM. | |||||
| We have five labels for the TSR task: | |||||
| - Column | |||||
| - Row | |||||
| - Column header | |||||
| - Projected row header | |||||
| - Spanning cell | |||||
| <div align="center" style="margin-top:20px;margin-bottom:20px;"> | |||||
| <img src="https://user-images.githubusercontent.com/10793386/139559159-cd23c972-8731-48ed-91df-f3f27e9f4d79.jpg" width="900"/> | |||||
| </div> | |||||
| <a name="3"></a> | |||||
| ## 3. Parser | |||||
| --- | |||||
| Four document formats (PDF, DOCX, EXCEL and PPT) have their corresponding parsers. | |||||
| The most complex one is the PDF parser, because of PDF's flexibility. The output of the PDF parser includes: | |||||
| - Text chunks with their own positions in the PDF (page number and rectangular positions). | |||||
| - Tables with cropped images from the PDF, and contents which have already been translated into natural language sentences. | |||||
| - Figures with caption and text in the figures. | |||||
| ### Résumé | |||||
| --- | |||||
| The résumé is a very complicated kind of document. A résumé, which is composed of unstructured text | |||||
| with various layouts, can be resolved into structured data composed of nearly a hundred fields. | |||||
| We haven't opened the parser yet; so far we have only opened the processing method that follows the parsing procedure. | |||||
| [English](./README.md) | 简体中文 |
| import random | |||||
| from .pdf_parser import HuParser as PdfParser | from .pdf_parser import HuParser as PdfParser | ||||
| from .docx_parser import HuDocxParser as DocxParser | from .docx_parser import HuDocxParser as DocxParser | ||||
| from .excel_parser import HuExcelParser as ExcelParser | from .excel_parser import HuExcelParser as ExcelParser | ||||
| import re | |||||
| from nltk import word_tokenize | |||||
| from rag.nlp import stemmer, huqie | |||||
| from rag.utils import num_tokens_from_string | |||||
# Families of heading/bullet regexes. Each inner list is one numbering
# style; bullets_category() picks the family that matches the most sections
# and hierarchical_merge() uses the position inside the chosen family as the
# heading level (index 0 first).
BULLET_PATTERN = [[
    # Family 0: "第…编/部分", "第…章", "第…节", "第…条", then parenthesised
    # Chinese numerals.
    r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
    r"第[零一二三四五六七八九十百0-9]+章",
    r"第[零一二三四五六七八九十百0-9]+节",
    r"第[零一二三四五六七八九十百0-9]+条",
    r"[\((][零一二三四五六七八九十百]+[\))]",
], [
    # Family 1: Arabic-numbered chapters/sections, then dotted decimals
    # with increasing depth ("1.", "1.1", "1.1.1", "1.1.1.1").
    # NOTE(review): "{,3}" / "{,2}" also allow zero digits - confirm intended.
    r"第[0-9]+章",
    r"第[0-9]+节",
    r"[0-9]{,3}[\. 、]",
    r"[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
    # Family 2: chapters/sections, Chinese numerals followed by a separator,
    # then parenthesised Chinese or Arabic numerals.
    r"第[零一二三四五六七八九十百0-9]+章",
    r"第[零一二三四五六七八九十百0-9]+节",
    r"[零一二三四五六七八九十百]+[ 、]",
    r"[\((][零一二三四五六七八九十百]+[\))]",
    r"[\((][0-9]{,2}[\))]",
], [
    # Family 3: English headings (PART / Chapter / Section / Article).
    r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
    r"Chapter (I+V?|VI*|XI|IX|X)",
    r"Section [0-9]+",
    r"Article [0-9]+"
]
]
def random_choices(arr, k):
    """Pick up to ``k`` random elements from ``arr``.

    The requested size is capped at ``len(arr)``. Selection is delegated to
    ``random.choices``, which samples with replacement, so the result may
    contain the same element more than once.
    """
    sample_size = min(k, len(arr))
    return random.choices(arr, k=sample_size)
def bullets_category(sections):
    """Return the index of the BULLET_PATTERN family that fits ``sections`` best.

    A section counts as a hit for a family when any regex of that family
    matches at the start of the section text. The family with the most hits
    wins (the earliest one on ties); -1 is returned when nothing matches.
    """
    hits = [
        sum(1 for sec in sections if any(re.match(p, sec) for p in family))
        for family in BULLET_PATTERN
    ]
    best_index, best_hits = -1, 0
    for index, count in enumerate(hits):
        # Strictly greater, so the first family keeps the lead on a tie.
        if count > best_hits:
            best_index, best_hits = index, count
    return best_index
def is_english(texts):
    """Tell whether a collection of text snippets is predominantly English.

    A snippet counts as English when, after stripping, it starts with at
    least two ASCII letters. Returns True when more than 80% of the snippets
    qualify. An empty collection is reported as not English instead of
    raising ZeroDivisionError.
    """
    if not texts:
        return False
    eng = sum(1 for t in texts if re.match(r"[a-zA-Z]{2,}", t.strip()))
    return eng / len(texts) > 0.8
def tokenize(d, t, eng):
    """Store text ``t`` and its token strings in the chunk dict ``d``.

    ``d["content_with_weight"]`` always receives the raw text. When ``eng``
    is truthy the text is word-tokenised and stemmed; otherwise it is
    tokenised with ``huqie``.
    """
    d["content_with_weight"] = t
    if eng:
        # Glue words that were split by a hyphen between lowercase letters.
        t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
    else:
        d["content_ltks"] = huqie.qie(t)
        # NOTE(review): the original indentation was lost in this view; this
        # line is assumed to belong to the non-English branch - confirm.
        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
def remove_contents_table(sections, eng=False):
    """Strip table-of-contents style blocks from ``sections`` in place.

    ``sections`` is a list of strings or of ``(text, ...)`` tuples. When a
    section is a heading such as "Contents" / "目录" (case-insensitive,
    whitespace ignored, anything after "@@" ignored), the heading and the
    entries following it are removed, up to the next section that starts
    with the same prefix as the first entry (the first three characters, or
    the first two words when ``eng`` is true). At most 128 following
    sections are scanned for that restart. Returns None.
    """
    def get(idx):
        item = sections[idx]
        return (item if isinstance(item, str) else item[0]).strip()

    def entry_prefix(idx):
        text = get(idx)
        return " ".join(text.split(" ")[:2]) if eng else text[:3]

    i = 0
    while i < len(sections):
        # Whitespace is removed before matching, hence "tableofcontents".
        # The flag is given to re.match; passing it to re.sub would be
        # interpreted as the `count` argument.
        heading = re.sub(r"\s+", "", get(i).split("@@")[0])
        if not re.match(r"(contents|目录|目次|tableofcontents|致谢|acknowledge)$",
                        heading, re.IGNORECASE):
            i += 1
            continue
        sections.pop(i)
        if i >= len(sections):
            break
        prefix = entry_prefix(i)
        # Skip blank sections that directly follow the heading.
        while not prefix:
            sections.pop(i)
            if i >= len(sections):
                break
            prefix = entry_prefix(i)
        if i >= len(sections) or not prefix:
            break
        sections.pop(i)
        if i >= len(sections):
            break
        for j in range(i, min(i + 128, len(sections))):
            # Literal prefix comparison: entries like "(一)" or "1.1" must
            # not be interpreted as regular expressions.
            if not get(j).startswith(prefix):
                continue
            del sections[i:j]
            break
def make_colon_as_title(sections):
    """Promote a trailing clause that ends with a colon to a "title" section.

    Returns ``[]`` for empty input and the input itself when it is a list of
    plain strings. For ``(text, layout)`` tuples the list is edited in place
    and None is returned.
    """
    if not sections:
        return []
    if type(sections[0]) == type(""):
        return sections
    pos = 0
    while pos < len(sections):
        text, _layout = sections[pos]
        pos += 1
        text = text.split("@")[0].strip()
        if not text or text[-1] not in "::":
            continue
        # Split the reversed text so that pieces[0] is the last clause.
        pieces = re.split(r"([。?!!?;;]| .)", text[::-1])
        # NOTE(review): pieces[1] is the captured delimiter (1-2 chars), so
        # the length test below can never pass and nothing is ever inserted.
        # Confirm which piece was meant to be measured.
        if len(pieces) >= 2 and len(pieces[1]) >= 32:
            sections.insert(pos - 1, (pieces[0][::-1], "title"))
            pos += 1
def hierarchical_merge(bull, sections, depth):
    """Group sections into chunks that carry their chain of parent headings.

    Args:
        bull: index of the BULLET_PATTERN family to use (negative: no family
            was detected, nothing is merged).
        sections: list of strings or ``(text, layout)`` tuples.
        depth: how many levels, counted from the finest one, may start a
            chunk.

    Returns:
        A list of chunks; each chunk is a list of section texts ordered from
        the outermost heading down to the section that started the chunk.
        ``[]`` when ``sections`` is empty or ``bull`` is negative.
    """
    if not sections or bull < 0:
        return []
    if isinstance(sections[0], str):
        sections = [(s, "") for s in sections]
    # Drop empty/one-character sections and bare numbers (text before "@").
    sections = [(t, o) for t, o in sections
                if t and len(t.split("@")[0].strip()) > 1
                and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
    bullets_size = len(BULLET_PATTERN[bull])
    # One bucket per bullet pattern, then one for layout-detected titles and
    # one for everything else.
    levels = [[] for _ in range(bullets_size + 2)]

    def not_title(txt):
        # "第…条" always counts as a heading; long text or text containing
        # sentence punctuation does not.
        if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
            return False
        if len(txt) >= 128:
            return True
        return re.search(r"[,;,。;!!]", txt)

    for i, (txt, layout) in enumerate(sections):
        for j, p in enumerate(BULLET_PATTERN[bull]):
            if re.match(p, txt.strip()) and not not_title(txt):
                levels[j].append(i)
                break
        else:
            if re.search(r"(title|head)", layout):
                levels[bullets_size].append(i)
            else:
                levels[bullets_size + 1].append(i)
    sections = [t for t, _ in sections]

    def binary_search(arr, target):
        # Position of the largest element of sorted `arr` that is smaller
        # than `target`, or -1 when there is none.
        if not arr:
            return -1
        if target > arr[-1]:
            return len(arr) - 1
        if target < arr[0]:
            return -1
        s, e = 0, len(arr)
        while e - s > 1:
            i = (e + s) // 2
            if target > arr[i]:
                s = i
                continue
            elif target < arr[i]:
                e = i
                continue
            else:
                # A section index lives in exactly one level.
                assert False
        return s

    cks = []
    readed = [False] * len(sections)
    levels = levels[::-1]  # finest level first
    for i, arr in enumerate(levels[:depth]):
        for j in arr:
            if readed[j]:
                continue
            readed[j] = True
            cks.append([j])
            if i + 1 == len(levels) - 1:
                continue
            for ii in range(i + 1, len(levels)):
                jj = binary_search(levels[ii], j)
                if jj < 0:
                    continue
                # Compare section indices (not the position inside the
                # level list): a heading that appears after the previously
                # collected one supersedes it.
                if levels[ii][jj] > cks[-1][-1]:
                    cks[-1].pop(-1)
                cks[-1].append(levels[ii][jj])
            for ii in cks[-1]:
                readed[ii] = True
    # Indices were collected innermost-first; emit text outermost-first.
    for i in range(len(cks)):
        cks[i] = [sections[j] for j in cks[i][::-1]]
    return cks
def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
    """Concatenate sections into chunks of roughly ``chunk_token_num`` tokens.

    ``sections`` is a list of strings or ``(text, position_tag)`` tuples.
    Each text is cut after every delimiter character; the pieces are
    appended to the current chunk until its token count exceeds
    ``chunk_token_num``, at which point a new chunk is started. The position
    tag is appended after each piece of at least 8 tokens. Returns the list
    of chunk strings (``[]`` for empty input).
    """
    if not sections:
        return []
    if type(sections[0]) == type(""):
        sections = [(s, "") for s in sections]
    chunks = [""]
    token_counts = [0]

    def add_chunk(piece, pos):
        n_tokens = num_tokens_from_string(piece)
        if n_tokens < 8:
            # Very short pieces do not carry their position tag.
            pos = ""
        if token_counts[-1] > chunk_token_num:
            chunks.append(piece + pos)
            token_counts.append(n_tokens)
            return
        chunks[-1] += piece + pos
        token_counts[-1] += n_tokens

    for sec, pos in sections:
        start, end = 0, 1
        while end < len(sec):
            if sec[end] not in delimiter:
                end += 1
                continue
            add_chunk(sec[start: end + 1], pos)
            start = end + 1
            end = start + 1
        if start < end:
            add_chunk(sec[start: end], pos)
    return chunks
| from .ppt_parser import HuPptParser as PptParser | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # | |||||
| from io import BytesIO | |||||
| from pptx import Presentation | |||||
class HuPptParser(object):
    """Extract the text of each slide of a PowerPoint presentation."""

    def __init__(self):
        super().__init__()

    def __extract(self, shape):
        """Return the text held by ``shape``, or None when it has none.

        Shape type 19 is handled as a table and 6 as a group of shapes
        (python-pptx MSO_SHAPE_TYPE.TABLE / GROUP).
        """
        if shape.shape_type == 19:
            tb = shape.table
            rows = []
            # Row 0 is used as the header: every data row is rendered as
            # "header: value; header: value; ...".
            for i in range(1, len(tb.rows)):
                rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text
                                       for j in range(len(tb.columns)) if tb.cell(i, j)]))
            return "\n".join(rows)

        if shape.has_text_frame:
            return shape.text_frame.text

        if shape.shape_type == 6:
            texts = []
            for p in shape.shapes:
                t = self.__extract(p)
                if t:
                    texts.append(t)
            return "\n".join(texts)

    def __call__(self, fnm, from_page, to_page, callback=None):
        """Parse slides ``from_page`` (inclusive) to ``to_page`` (exclusive).

        Args:
            fnm: path of the presentation, or its raw bytes.
            from_page: zero-based index of the first slide to read.
            to_page: zero-based index one past the last slide to read.
            callback: accepted for interface compatibility; not used here.

        Returns:
            One string per slide, the slide's shape texts joined by newlines.
            ``self.total_page`` is set to the number of slides in the file.
        """
        ppt = Presentation(fnm) if isinstance(
            fnm, str) else Presentation(
            BytesIO(fnm))
        txts = []
        self.total_page = len(ppt.slides)
        # The python-pptx Slides collection only supports integer indexing,
        # so the page window is applied while iterating instead of slicing.
        for page_no, slide in enumerate(ppt.slides):
            if page_no < from_page:
                continue
            if page_no >= to_page:
                break
            texts = []
            for shape in slide.shapes:
                txt = self.__extract(shape)
                if txt:
                    texts.append(txt)
            txts.append("\n".join(texts))

        return txts
| import datetime | |||||
def refactor(cv):
    """Normalise a parsed résumé dict in place and return it.

    Steps performed on ``cv``:
      * drop raw parser bookkeeping fields and ``basic.photo2``;
      * turn each list-like section (education, work, ...) into a dict keyed
        by "0", "1", ... and strip non-None ``external`` entries; sections
        of any other type are removed, None sections are left untouched;
      * rename a few ``basic`` salary fields;
      * derive summary fields in ``basic`` from the earliest/latest work
        entry and the latest education entry (ordered by ``start_time``);
      * stamp ``basic.updated_at`` and make sure ``contact.name`` is set.
    """
    for n in ["raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"]:
        if n in cv and cv[n] is not None:
            del cv[n]
    cv["is_deleted"] = 0
    # A missing or None "basic" section is replaced by an empty one.
    if cv.get("basic") is None:
        cv["basic"] = {}
    if cv["basic"].get("photo2"):
        del cv["basic"]["photo2"]

    for n in ["education", "work", "certificate", "project", "language", "skill", "training"]:
        if n not in cv or cv[n] is None:
            continue
        if isinstance(cv[n], dict):
            cv[n] = list(cv[n].values())
        if not isinstance(cv[n], list):
            del cv[n]
            continue
        vv = []
        for v in cv[n]:
            if "external" in v and v["external"] is not None:
                del v["external"]
            vv.append(v)
        cv[n] = {str(i): v for i, v in enumerate(vv)}

    basics = [
        ("basic_salary_month", "salary_month"),
        ("expect_annual_salary_from", "expect_annual_salary"),
    ]
    for n, t in basics:
        if cv["basic"].get(n):
            cv["basic"][t] = cv["basic"][n]
            del cv["basic"][n]

    # `or {}` covers sections that are present but None (left as-is above).
    work = sorted((cv.get("work") or {}).values(), key=lambda x: x.get("start_time", ""))
    edu = sorted((cv.get("education") or {}).values(), key=lambda x: x.get("start_time", ""))

    if work:
        cv["basic"]["work_start_time"] = work[0].get("start_time", "")
        cv["basic"]["management_experience"] = 'Y' if any(
            w.get("management_experience", '') == 'Y' for w in work) else 'N'
        cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0")

        # Copy the most recent job's attributes into the summary.
        for n in ["annual_salary_from", "annual_salary_to", "industry_name", "position_name", "responsibilities",
                  "corporation_type", "scale", "corporation_name"]:
            cv["basic"][n] = work[-1].get(n, "")

    if edu:
        for n in ["school_name", "discipline_name"]:
            if n in edu[-1]:
                cv["basic"][n] = edu[-1][n]

    cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    if cv.get("contact") is None:
        cv["contact"] = {}
    if not cv["contact"].get("name"):
        cv["contact"]["name"] = cv["basic"].get("name", "")
    return cv
| import re,json,os | |||||
| import pandas as pd | |||||
| from rag.nlp import huqie | |||||
| from . import regions | |||||
# Directory of this module; the lookup tables live in its "res" folder.
current_file_path = os.path.dirname(os.path.abspath(__file__))


def _load_json(relative_path):
    """Load a JSON resource stored next to this module and close the file."""
    with open(os.path.join(current_file_path, relative_path), "r") as f:
        return json.load(f)


# Tab-separated table indexed by corporation id (as string); baike() reads
# its "len" column.
GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
GOODS["cid"] = GOODS["cid"].astype(str)
GOODS = GOODS.set_index(["cid"])
CORP_TKS = _load_json("res/corp.tks.freq.json")
GOOD_CORP = _load_json("res/good_corp.json")
CORP_TAG = _load_json("res/corp_tag.json")
def baike(cid, default_v=0):
    """Look up the "len" value recorded for corporation ``cid`` in GOODS.

    ``cid`` is converted to a string before the lookup. Any failure of the
    lookup yields ``default_v`` instead of an exception.
    """
    try:
        return GOODS.loc[str(cid), "len"]
    except Exception:
        # Best-effort lookup: unknown ids simply fall back to the default.
        return default_v
def corpNorm(nm, add_region=True):
    """Normalise a corporation name to a short canonical form.

    Non-string or empty input gives "". Otherwise the name is converted with
    ``huqie`` (full-width to half-width, traditional to simplified),
    lower-cased, stripped of punctuation, legal-form suffixes and generic
    corporate words, tokenised, and rebuilt without region names and tokens
    listed in CORP_TKS. When ``add_region`` is true and a region token was
    found, the first one is appended as "(region)".
    """
    if not nm or type(nm) != type(""):
        return ""
    nm = huqie.tradi2simp(huqie.strQ2B(nm)).lower()
    # NOTE(review): as written this substitution replaces "&" with "&" and
    # has no effect - presumably an HTML entity was meant; confirm.
    nm = re.sub(r"&", "&", nm)
    nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
    nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm,
                count=10000, flags=re.IGNORECASE)
    nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm,
                count=10000, flags=re.IGNORECASE)
    if not nm:
        return nm
    if len(nm) < 5 and not regions.isName(nm[0:2]):
        return nm

    tks = huqie.qie(nm).split(" ")
    # Region tokens; a leading "中国" is not treated as a region.
    reg = [t for i, t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
    nm = ""
    for t in tks:
        if regions.isName(t) or t in CORP_TKS:
            continue
        # Keep a space between consecutive alphanumeric tokens.
        if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):
            nm += " "
        nm += t

    # Keep only the leading non-Latin part when Latin text trails it ...
    r = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", nm.strip())
    if r:
        nm = r.group(1)
    # ... or only the leading Latin part when non-Latin text trails it.
    r = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", nm.strip())
    if r:
        nm = r.group(1)
    region_suffix = "(%s)" % reg[0] if (add_region and reg) else ""
    return nm.strip() + region_suffix
def rmNoise(n):
    """Remove parenthesised parts, then punctuation and spaces, from ``n``."""
    without_parens = re.sub(r"[\((][^()()]+[))]", "", n)
    return re.sub(r"[,. &()()]+", "", without_parens)
# Re-key both lookup tables by the normalised corporation name (without the
# region suffix) so that is_good() / corp_tag() can compare like with like.
GOOD_CORP = {corpNorm(rmNoise(c), False) for c in GOOD_CORP}

_normalized_corp_tag = {}
for c, v in CORP_TAG.items():
    cc = corpNorm(rmNoise(c), False)
    # Report names that normalise to an empty string.
    if not cc: print (c)
    _normalized_corp_tag[cc] = v
CORP_TAG = _normalized_corp_tag
def is_good(nm):
    """Tell whether corporation name ``nm`` refers to an entry of GOOD_CORP.

    Names containing "外派" are rejected outright. After normalisation, a
    purely alphanumeric GOOD_CORP entry must equal the name exactly, while
    any other entry only has to occur inside it.
    """
    if "外派" in nm:
        return False
    nm = corpNorm(rmNoise(nm), False)
    for good in GOOD_CORP:
        if re.match(r"[0-9a-zA-Z]+$", good):
            if good == nm:
                return True
        elif good in nm:
            return True
    return False
def corp_tag(nm):
    """Return the CORP_TAG value of the first entry that matches name ``nm``.

    The name is normalised first. Entries made only of ASCII letters,
    digits, ".", "," and spaces must equal the name; other entries match
    when contained in it, except that entries shorter than 3 characters are
    skipped when the name is at least twice as long. Returns ``[]`` when
    nothing matches.
    """
    nm = corpNorm(rmNoise(nm), False)
    for n, tags in CORP_TAG.items():
        # A key that normalised to "" would be contained in every name and
        # would divide by zero below, so it is ignored.
        if not n:
            continue
        if re.match(r"[0-9a-zA-Z., ]+$", n):
            if n == nm:
                return tags
        elif n in nm:
            if len(n) < 3 and len(nm) / len(n) >= 2:
                continue
            return tags
    return []
# Degree id (as string) -> degree name.
TBL = {
    "94": "EMBA",
    "6": "MBA",
    "95": "MPA",
    "92": "专升本",
    "4": "专科",
    "90": "中专",
    "91": "中技",
    "86": "初中",
    "3": "博士",
    "10": "博士后",
    "1": "本科",
    "2": "硕士",
    "87": "职高",
    "89": "高中",
}

# Reverse mapping: degree name -> degree id.
TBL_ = dict((name, code) for code, name in TBL.items())


def get_name(id):
    """Return the degree name for ``id`` (int or str), or "" when unknown."""
    key = str(id)
    return TBL[key] if key in TBL else ""


def get_id(nm):
    """Return the degree id for name ``nm``, or "" when empty or unknown.

    The name is upper-cased and stripped before the lookup.
    """
    if not nm:
        return ""
    key = nm.upper().strip()
    return TBL_[key] if key in TBL_ else ""
| TBL = {"1":{"name":"IT/通信/电子","parent":"0"}, | |||||
| "2":{"name":"互联网","parent":"0"}, | |||||
| "3":{"name":"电子商务","parent":"2"}, | |||||
| "4":{"name":"互联网金融","parent":"2"}, | |||||
| "5":{"name":"网络游戏","parent":"2"}, | |||||
| "6":{"name":"社交网络平台","parent":"2"}, | |||||
| "7":{"name":"视频音乐","parent":"2"}, | |||||
| "9":{"name":"安全","parent":"2"}, | |||||
| "10":{"name":"云计算","parent":"2"}, | |||||
| "12":{"name":"工具类客户端应用","parent":"2"}, | |||||
| "13":{"name":"互联网广告","parent":"2"}, | |||||
| "14":{"name":"企业互联网服务","parent":"2"}, | |||||
| "16":{"name":"在线教育","parent":"2"}, | |||||
| "17":{"name":"在线医疗","parent":"2"}, | |||||
| "19":{"name":"B2B","parent":"3"}, | |||||
| "20":{"name":"B2C","parent":"3"}, | |||||
| "21":{"name":"C2C","parent":"3"}, | |||||
| "22":{"name":"生活信息本地化","parent":"3"}, | |||||
| "23":{"name":"在线旅游","parent":"2"}, | |||||
| "24":{"name":"第三方支付","parent":"4"}, | |||||
| "26":{"name":"客户端游戏","parent":"5"}, | |||||
| "27":{"name":"网页游戏","parent":"5"}, | |||||
| "28":{"name":"手机游戏","parent":"5"}, | |||||
| "29":{"name":"微博","parent":"6"}, | |||||
| "30":{"name":"社交网站","parent":"6"}, | |||||
| "31":{"name":"在线视频","parent":"7"}, | |||||
| "32":{"name":"在线音乐","parent":"7"}, | |||||
| "35":{"name":"企业安全","parent":"9"}, | |||||
| "36":{"name":"个人安全","parent":"9"}, | |||||
| "37":{"name":"企业级云服务","parent":"10"}, | |||||
| "38":{"name":"个人级云服务","parent":"10"}, | |||||
| "43":{"name":"输入法","parent":"12"}, | |||||
| "44":{"name":"浏览器","parent":"12"}, | |||||
| "45":{"name":"词典","parent":"12"}, | |||||
| "46":{"name":"播放器","parent":"12"}, | |||||
| "47":{"name":"下载器","parent":"12"}, | |||||
| "48":{"name":"IM","parent":"12"}, | |||||
| "49":{"name":"广告服务","parent":"13"}, | |||||
| "50":{"name":"第三方广告网络平台","parent":"13"}, | |||||
| "51":{"name":"媒体代理","parent":"13"}, | |||||
| "52":{"name":"创意代理","parent":"13"}, | |||||
| "53":{"name":"IT-综合","parent":"1"}, | |||||
| "71":{"name":"团购","parent":"3"}, | |||||
| "72":{"name":"地图","parent":"2"}, | |||||
| "73":{"name":"数据存储","parent":"2"}, | |||||
| "414":{"name":"计算机软件","parent":"1"}, | |||||
| "415":{"name":"计算机硬件","parent":"1"}, | |||||
| "416":{"name":"计算机服务(系统、数据服务、维修)","parent":"1"}, | |||||
| "417":{"name":"通信/电信/网络设备","parent":"1"}, | |||||
| "418":{"name":"通信/电信运营、增值服务","parent":"1"}, | |||||
| "419":{"name":"电子技术/半导体/集成电路","parent":"1"}, | |||||
| "472":{"name":"P2P网贷","parent":"4"}, | |||||
| "473":{"name":"互联网理财","parent":"4"}, | |||||
| "474":{"name":"婚恋","parent":"6"}, | |||||
| "476":{"name":"虚拟化","parent":"10"}, | |||||
| "477":{"name":"邮箱","parent":"12"}, | |||||
| "478":{"name":"商业智能","parent":"14"}, | |||||
| "479":{"name":"企业建站","parent":"14"}, | |||||
| "480":{"name":"安防","parent":"14"}, | |||||
| "481":{"name":"网络营销","parent":"2"}, | |||||
| "487":{"name":"智能终端","parent":"2"}, | |||||
| "488":{"name":"移动互联网","parent":"2"}, | |||||
| "489":{"name":"数字城市","parent":"2"}, | |||||
| "490":{"name":"大数据","parent":"2"}, | |||||
| "491":{"name":"互联网人力资源","parent":"2"}, | |||||
| "492":{"name":"舆情监控","parent":"2"}, | |||||
| "493":{"name":"移动营销","parent":"481"}, | |||||
| "494":{"name":"微博营销","parent":"481"}, | |||||
| "495":{"name":"精准营销","parent":"481"}, | |||||
| "496":{"name":"海外营销","parent":"481"}, | |||||
| "497":{"name":"微信营销","parent":"481"}, | |||||
| "498":{"name":"智能手机","parent":"487"}, | |||||
| "499":{"name":"可穿戴设备","parent":"487"}, | |||||
| "500":{"name":"智能电视","parent":"487"}, | |||||
| "501":{"name":"WAP","parent":"488"}, | |||||
| "502":{"name":"物联网","parent":"489"}, | |||||
| "503":{"name":"O2O","parent":"489"}, | |||||
| "504":{"name":"数字出版","parent":"489"}, | |||||
| "505":{"name":"搜索","parent":"2"}, | |||||
| "506":{"name":"垂直搜索","parent":"505"}, | |||||
| "507":{"name":"无线搜索","parent":"505"}, | |||||
| "508":{"name":"网页搜索","parent":"505"}, | |||||
| "509":{"name":"网址导航","parent":"2"}, | |||||
| "510":{"name":"门户","parent":"2"}, | |||||
| "511":{"name":"网络文学","parent":"2"}, | |||||
| "512":{"name":"自媒体","parent":"2"}, | |||||
| "513":{"name":"金融","parent":"0"}, | |||||
| "514":{"name":"建筑与房地产","parent":"0"}, | |||||
| "515":{"name":"专业服务","parent":"0"}, | |||||
| "516":{"name":"教育培训","parent":"0"}, | |||||
| "517":{"name":"文化传媒","parent":"0"}, | |||||
| "518":{"name":"消费品","parent":"0"}, | |||||
| "519":{"name":"工业","parent":"0"}, | |||||
| "520":{"name":"交通物流","parent":"0"}, | |||||
| "521":{"name":"贸易","parent":"0"}, | |||||
| "522":{"name":"医药","parent":"0"}, | |||||
| "523":{"name":"医疗器械","parent":"522"}, | |||||
| "524":{"name":"保健品","parent":"518"}, | |||||
| "525":{"name":"服务业","parent":"0"}, | |||||
| "526":{"name":"能源/矿产/环保","parent":"0"}, | |||||
| "527":{"name":"化工","parent":"0"}, | |||||
| "528":{"name":"政府","parent":"0"}, | |||||
| "529":{"name":"公共事业","parent":"0"}, | |||||
| "530":{"name":"非盈利机构","parent":"0"}, | |||||
| "531":{"name":"农业","parent":"1131"}, | |||||
| "532":{"name":"林业","parent":"1131"}, | |||||
| "533":{"name":"畜牧业","parent":"1131"}, | |||||
| "534":{"name":"渔业","parent":"1131"}, | |||||
| "535":{"name":"学术科研","parent":"0"}, | |||||
| "536":{"name":"零售","parent":"0"}, | |||||
| "537":{"name":"银行","parent":"513"}, | |||||
| "538":{"name":"保险","parent":"513"}, | |||||
| "539":{"name":"证券","parent":"513"}, | |||||
| "540":{"name":"基金","parent":"513"}, | |||||
| "541":{"name":"信托","parent":"513"}, | |||||
| "542":{"name":"担保","parent":"513"}, | |||||
| "543":{"name":"典当","parent":"513"}, | |||||
| "544":{"name":"拍卖","parent":"513"}, | |||||
| "545":{"name":"投资/融资","parent":"513"}, | |||||
| "546":{"name":"期货","parent":"513"}, | |||||
| "547":{"name":"房地产开发","parent":"514"}, | |||||
| "548":{"name":"工程施工","parent":"514"}, | |||||
| "549":{"name":"建筑设计","parent":"514"}, | |||||
| "550":{"name":"房地产代理","parent":"514"}, | |||||
| "551":{"name":"物业管理","parent":"514"}, | |||||
| "552":{"name":"室内设计","parent":"514"}, | |||||
| "553":{"name":"装修装潢","parent":"514"}, | |||||
| "554":{"name":"市政工程","parent":"514"}, | |||||
| "555":{"name":"工程造价","parent":"514"}, | |||||
| "556":{"name":"工程监理","parent":"514"}, | |||||
| "557":{"name":"环境工程","parent":"514"}, | |||||
| "558":{"name":"园林景观","parent":"514"}, | |||||
| "559":{"name":"法律","parent":"515"}, | |||||
| "560":{"name":"人力资源","parent":"515"}, | |||||
| "561":{"name":"会计","parent":"1125"}, | |||||
| "562":{"name":"审计","parent":"515"}, | |||||
| "563":{"name":"检测认证","parent":"515"}, | |||||
| "565":{"name":"翻译","parent":"515"}, | |||||
| "566":{"name":"中介","parent":"515"}, | |||||
| "567":{"name":"咨询","parent":"515"}, | |||||
| "568":{"name":"外包服务","parent":"515"}, | |||||
| "569":{"name":"家教","parent":"516"}, | |||||
| "570":{"name":"早教","parent":"516"}, | |||||
| "571":{"name":"职业技能培训","parent":"516"}, | |||||
| "572":{"name":"外语培训","parent":"516"}, | |||||
| "573":{"name":"设计培训","parent":"516"}, | |||||
| "574":{"name":"IT培训","parent":"516"}, | |||||
| "575":{"name":"文艺体育培训","parent":"516"}, | |||||
| "576":{"name":"学历教育","parent":"516"}, | |||||
| "577":{"name":"管理培训","parent":"516"}, | |||||
| "578":{"name":"民办基础教育","parent":"516"}, | |||||
| "579":{"name":"广告","parent":"517"}, | |||||
| "580":{"name":"媒体","parent":"517"}, | |||||
| "581":{"name":"会展","parent":"517"}, | |||||
| "582":{"name":"公关","parent":"517"}, | |||||
| "583":{"name":"影视","parent":"517"}, | |||||
| "584":{"name":"艺术","parent":"517"}, | |||||
| "585":{"name":"文化传播","parent":"517"}, | |||||
| "586":{"name":"娱乐","parent":"517"}, | |||||
| "587":{"name":"体育","parent":"517"}, | |||||
| "588":{"name":"出版","parent":"517"}, | |||||
| "589":{"name":"休闲","parent":"517"}, | |||||
| "590":{"name":"动漫","parent":"517"}, | |||||
| "591":{"name":"市场推广","parent":"517"}, | |||||
| "592":{"name":"市场研究","parent":"517"}, | |||||
| "593":{"name":"食品","parent":"1129"}, | |||||
| "594":{"name":"饮料","parent":"1129"}, | |||||
| "595":{"name":"烟草","parent":"1129"}, | |||||
| "596":{"name":"酒品","parent":"518"}, | |||||
| "597":{"name":"服饰","parent":"518"}, | |||||
| "598":{"name":"纺织","parent":"518"}, | |||||
| "599":{"name":"化妆品","parent":"1129"}, | |||||
| "600":{"name":"日用品","parent":"1129"}, | |||||
| "601":{"name":"家电","parent":"518"}, | |||||
| "602":{"name":"家具","parent":"518"}, | |||||
| "603":{"name":"办公用品","parent":"518"}, | |||||
| "604":{"name":"奢侈品","parent":"518"}, | |||||
| "605":{"name":"珠宝","parent":"518"}, | |||||
| "606":{"name":"数码产品","parent":"518"}, | |||||
| "607":{"name":"玩具","parent":"518"}, | |||||
| "608":{"name":"图书","parent":"518"}, | |||||
| "609":{"name":"音像","parent":"518"}, | |||||
| "610":{"name":"钟表","parent":"518"}, | |||||
| "611":{"name":"箱包","parent":"518"}, | |||||
| "612":{"name":"母婴","parent":"518"}, | |||||
| "613":{"name":"营养保健","parent":"518"}, | |||||
| "614":{"name":"户外用品","parent":"518"}, | |||||
| "615":{"name":"健身器材","parent":"518"}, | |||||
| "616":{"name":"乐器","parent":"518"}, | |||||
| "617":{"name":"汽车用品","parent":"518"}, | |||||
| "619":{"name":"厨具","parent":"518"}, | |||||
| "620":{"name":"机械制造","parent":"519"}, | |||||
| "621":{"name":"流体控制","parent":"519"}, | |||||
| "622":{"name":"自动化控制","parent":"519"}, | |||||
| "623":{"name":"仪器仪表","parent":"519"}, | |||||
| "624":{"name":"航空/航天","parent":"519"}, | |||||
| "625":{"name":"交通设施","parent":"519"}, | |||||
| "626":{"name":"工业电子","parent":"519"}, | |||||
| "627":{"name":"建材","parent":"519"}, | |||||
| "628":{"name":"五金材料","parent":"519"}, | |||||
| "629":{"name":"汽车","parent":"519"}, | |||||
| "630":{"name":"印刷","parent":"519"}, | |||||
| "631":{"name":"造纸","parent":"519"}, | |||||
| "632":{"name":"包装","parent":"519"}, | |||||
| "633":{"name":"原材料及加工","parent":"519"}, | |||||
| "634":{"name":"物流","parent":"520"}, | |||||
| "635":{"name":"仓储","parent":"520"}, | |||||
| "636":{"name":"客运","parent":"520"}, | |||||
| "637":{"name":"快递","parent":"520"}, | |||||
| "638":{"name":"化学药","parent":"522"}, | |||||
| "639":{"name":"中药","parent":"522"}, | |||||
| "640":{"name":"生物制药","parent":"522"}, | |||||
| "641":{"name":"兽药","parent":"522"}, | |||||
| "642":{"name":"农药","parent":"522"}, | |||||
| "643":{"name":"CRO","parent":"522"}, | |||||
| "644":{"name":"消毒","parent":"522"}, | |||||
| "645":{"name":"医药商业","parent":"522"}, | |||||
| "646":{"name":"医疗服务","parent":"522"}, | |||||
| "647":{"name":"医疗器械","parent":"523"}, | |||||
| "648":{"name":"制药设备","parent":"523"}, | |||||
| "649":{"name":"医用耗材","parent":"523"}, | |||||
| "650":{"name":"手术器械","parent":"523"}, | |||||
| "651":{"name":"保健器材","parent":"524"}, | |||||
| "652":{"name":"性保健品","parent":"524"}, | |||||
| "653":{"name":"医药保养","parent":"524"}, | |||||
| "654":{"name":"医用保健","parent":"524"}, | |||||
| "655":{"name":"酒店","parent":"525"}, | |||||
| "656":{"name":"餐饮","parent":"525"}, | |||||
| "657":{"name":"旅游","parent":"525"}, | |||||
| "658":{"name":"生活服务","parent":"525"}, | |||||
| "659":{"name":"保健服务","parent":"525"}, | |||||
| "660":{"name":"运动健身","parent":"525"}, | |||||
| "661":{"name":"家政服务","parent":"525"}, | |||||
| "662":{"name":"婚庆服务","parent":"525"}, | |||||
| "663":{"name":"租赁服务","parent":"525"}, | |||||
| "664":{"name":"维修服务","parent":"525"}, | |||||
| "665":{"name":"石油天然气","parent":"526"}, | |||||
| "666":{"name":"电力","parent":"526"}, | |||||
| "667":{"name":"新能源","parent":"526"}, | |||||
| "668":{"name":"水利","parent":"526"}, | |||||
| "669":{"name":"矿产","parent":"526"}, | |||||
| "670":{"name":"采掘业","parent":"526"}, | |||||
| "671":{"name":"冶炼","parent":"526"}, | |||||
| "672":{"name":"环保","parent":"526"}, | |||||
| "673":{"name":"无机化工原料","parent":"527"}, | |||||
| "674":{"name":"有机化工原料","parent":"527"}, | |||||
| "675":{"name":"精细化学品","parent":"527"}, | |||||
| "676":{"name":"化工设备","parent":"527"}, | |||||
| "677":{"name":"化工工程","parent":"527"}, | |||||
| "678":{"name":"资产管理","parent":"513"}, | |||||
| "679":{"name":"金融租赁","parent":"513"}, | |||||
| "680":{"name":"征信及信评机构","parent":"513"}, | |||||
| "681":{"name":"资产评估机构","parent":"513"}, | |||||
| "683":{"name":"金融监管机构","parent":"513"}, | |||||
| "684":{"name":"国际贸易","parent":"521"}, | |||||
| "685":{"name":"海关","parent":"521"}, | |||||
| "686":{"name":"购物中心","parent":"536"}, | |||||
| "687":{"name":"超市","parent":"536"}, | |||||
| "688":{"name":"便利店","parent":"536"}, | |||||
| "689":{"name":"专卖店","parent":"536"}, | |||||
| "690":{"name":"专业店","parent":"536"}, | |||||
| "691":{"name":"百货店","parent":"536"}, | |||||
| "692":{"name":"杂货店","parent":"536"}, | |||||
| "693":{"name":"个人银行","parent":"537"}, | |||||
| "695":{"name":"私人银行","parent":"537"}, | |||||
| "696":{"name":"公司银行","parent":"537"}, | |||||
| "697":{"name":"投资银行","parent":"537"}, | |||||
| "698":{"name":"政策性银行","parent":"537"}, | |||||
| "699":{"name":"中央银行","parent":"537"}, | |||||
| "700":{"name":"人寿险","parent":"538"}, | |||||
| "701":{"name":"财产险","parent":"538"}, | |||||
| "702":{"name":"再保险","parent":"538"}, | |||||
| "703":{"name":"养老险","parent":"538"}, | |||||
| "704":{"name":"保险代理公司","parent":"538"}, | |||||
| "705":{"name":"公募基金","parent":"540"}, | |||||
| "707":{"name":"私募基金","parent":"540"}, | |||||
| "708":{"name":"第三方理财","parent":"679"}, | |||||
| "709":{"name":"资产管理公司","parent":"679"}, | |||||
| "711":{"name":"房产中介","parent":"566"}, | |||||
| "712":{"name":"职业中介","parent":"566"}, | |||||
| "713":{"name":"婚姻中介","parent":"566"}, | |||||
| "714":{"name":"战略咨询","parent":"567"}, | |||||
| "715":{"name":"投资咨询","parent":"567"}, | |||||
| "716":{"name":"心理咨询","parent":"567"}, | |||||
| "717":{"name":"留学移民咨询","parent":"567"}, | |||||
| "718":{"name":"工商注册代理","parent":"568"}, | |||||
| "719":{"name":"商标专利代理","parent":"568"}, | |||||
| "720":{"name":"财务代理","parent":"568"}, | |||||
| "721":{"name":"工程机械","parent":"620"}, | |||||
| "722":{"name":"农业机械","parent":"620"}, | |||||
| "723":{"name":"海工设备","parent":"620"}, | |||||
| "724":{"name":"包装机械","parent":"620"}, | |||||
| "725":{"name":"印刷机械","parent":"620"}, | |||||
| "726":{"name":"数控机床","parent":"620"}, | |||||
| "727":{"name":"矿山机械","parent":"620"}, | |||||
| "728":{"name":"水泵","parent":"621"}, | |||||
| "729":{"name":"管道","parent":"621"}, | |||||
| "730":{"name":"阀门","parent":"621"}, | |||||
| "732":{"name":"压缩机","parent":"621"}, | |||||
| "733":{"name":"集散控制系统","parent":"622"}, | |||||
| "734":{"name":"远程控制","parent":"622"}, | |||||
| "735":{"name":"液压系统","parent":"622"}, | |||||
| "736":{"name":"楼宇智能化","parent":"622"}, | |||||
| "737":{"name":"飞机制造","parent":"624"}, | |||||
| "738":{"name":"航空公司","parent":"624"}, | |||||
| "739":{"name":"发动机","parent":"624"}, | |||||
| "740":{"name":"复合材料","parent":"624"}, | |||||
| "741":{"name":"高铁","parent":"625"}, | |||||
| "742":{"name":"地铁","parent":"625"}, | |||||
| "743":{"name":"信号传输","parent":"625"}, | |||||
| "745":{"name":"结构材料","parent":"627"}, | |||||
| "746":{"name":"装饰材料","parent":"627"}, | |||||
| "747":{"name":"专用材料","parent":"627"}, | |||||
| "749":{"name":"经销商集团","parent":"629"}, | |||||
| "750":{"name":"整车制造","parent":"629"}, | |||||
| "751":{"name":"汽车零配件","parent":"629"}, | |||||
| "752":{"name":"外型设计","parent":"629"}, | |||||
| "753":{"name":"平版印刷","parent":"630"}, | |||||
| "754":{"name":"凸版印刷","parent":"630"}, | |||||
| "755":{"name":"凹版印刷","parent":"630"}, | |||||
| "756":{"name":"孔版印刷","parent":"630"}, | |||||
| "757":{"name":"印刷用纸","parent":"631"}, | |||||
| "758":{"name":"书写、制图及复制用纸","parent":"631"}, | |||||
| "759":{"name":"包装用纸","parent":"631"}, | |||||
| "760":{"name":"生活、卫生及装饰用纸","parent":"631"}, | |||||
| "761":{"name":"技术用纸","parent":"631"}, | |||||
| "762":{"name":"加工纸原纸","parent":"631"}, | |||||
| "763":{"name":"食品包装","parent":"632"}, | |||||
| "764":{"name":"医药包装","parent":"632"}, | |||||
| "765":{"name":"日化包装","parent":"632"}, | |||||
| "766":{"name":"物流包装","parent":"632"}, | |||||
| "767":{"name":"礼品包装","parent":"632"}, | |||||
| "768":{"name":"电子五金包装","parent":"632"}, | |||||
| "769":{"name":"汽车服务","parent":"525"}, | |||||
| "770":{"name":"汽车保养","parent":"769"}, | |||||
| "771":{"name":"租车","parent":"769"}, | |||||
| "773":{"name":"出租车","parent":"769"}, | |||||
| "774":{"name":"代驾","parent":"769"}, | |||||
| "775":{"name":"发电","parent":"666"}, | |||||
| "777":{"name":"输配电","parent":"666"}, | |||||
| "779":{"name":"风电","parent":"667"}, | |||||
| "780":{"name":"光伏/太阳能","parent":"667"}, | |||||
| "781":{"name":"生物质发电","parent":"667"}, | |||||
| "782":{"name":"煤化工","parent":"667"}, | |||||
| "783":{"name":"垃圾发电","parent":"667"}, | |||||
| "784":{"name":"核电","parent":"667"}, | |||||
| "785":{"name":"能源矿产","parent":"669"}, | |||||
| "786":{"name":"金属矿产","parent":"669"}, | |||||
| "787":{"name":"非金属矿产","parent":"669"}, | |||||
| "788":{"name":"水气矿产","parent":"669"}, | |||||
| "789":{"name":"锅炉","parent":"775"}, | |||||
| "790":{"name":"发电机","parent":"775"}, | |||||
| "791":{"name":"汽轮机","parent":"775"}, | |||||
| "792":{"name":"燃机","parent":"775"}, | |||||
| "793":{"name":"冷却","parent":"775"}, | |||||
| "794":{"name":"电力设计院","parent":"775"}, | |||||
| "795":{"name":"高压输配电","parent":"777"}, | |||||
| "796":{"name":"中压输配电","parent":"777"}, | |||||
| "797":{"name":"低压输配电","parent":"777"}, | |||||
| "798":{"name":"继电保护","parent":"777"}, | |||||
| "799":{"name":"智能电网","parent":"777"}, | |||||
| "800":{"name":"小学","parent":"516"}, | |||||
| "801":{"name":"电动车","parent":"519"}, | |||||
| "802":{"name":"皮具箱包","parent":"518"}, | |||||
| "803":{"name":"医药制造","parent":"522"}, | |||||
| "804":{"name":"电器销售","parent":"536"}, | |||||
| "805":{"name":"塑料制品","parent":"527"}, | |||||
| "806":{"name":"公益基金会","parent":"530"}, | |||||
| "807":{"name":"美发服务","parent":"525"}, | |||||
| "808":{"name":"农业养殖","parent":"531"}, | |||||
| "809":{"name":"金融服务","parent":"513"}, | |||||
| "810":{"name":"商业地产综合体","parent":"514"}, | |||||
| "811":{"name":"美容服务","parent":"525"}, | |||||
| "812":{"name":"灯饰","parent":"518"}, | |||||
| "813":{"name":"油墨颜料产品","parent":"527"}, | |||||
| "814":{"name":"眼镜制造","parent":"518"}, | |||||
| "815":{"name":"农业生物技术","parent":"531"}, | |||||
| "816":{"name":"体育用品","parent":"518"}, | |||||
| "817":{"name":"保健用品","parent":"524"}, | |||||
| "818":{"name":"化学化工产品","parent":"527"}, | |||||
| "819":{"name":"饲料","parent":"531"}, | |||||
| "821":{"name":"保安服务","parent":"525"}, | |||||
| "822":{"name":"干细胞技术","parent":"522"}, | |||||
| "824":{"name":"农药化肥","parent":"527"}, | |||||
| "825":{"name":"卫生洁具","parent":"518"}, | |||||
| "826":{"name":"体育器材、场馆","parent":"518"}, | |||||
| "827":{"name":"饲料加工","parent":"531"}, | |||||
| "828":{"name":"测绘服务","parent":"529"}, | |||||
| "830":{"name":"金属船舶制造","parent":"519"}, | |||||
| "831":{"name":"基因工程","parent":"522"}, | |||||
| "832":{"name":"花卉服务","parent":"536"}, | |||||
| "833":{"name":"农业种植","parent":"531"}, | |||||
| "834":{"name":"皮革制品","parent":"518"}, | |||||
| "835":{"name":"地理信息加工服务","parent":"529"}, | |||||
| "836":{"name":"机器人","parent":"519"}, | |||||
| "837":{"name":"礼品","parent":"518"}, | |||||
| "838":{"name":"理发及美容服务","parent":"525"}, | |||||
| "839":{"name":"其他清洁服务","parent":"525"}, | |||||
| "840":{"name":"硅胶材料","parent":"527"}, | |||||
| "841":{"name":"茶叶销售","parent":"518"}, | |||||
| "842":{"name":"彩票活动","parent":"529"}, | |||||
| "843":{"name":"化妆培训","parent":"516"}, | |||||
| "844":{"name":"鞋业","parent":"518"}, | |||||
| "845":{"name":"酒店用品","parent":"518"}, | |||||
| "846":{"name":"复合材料","parent":"527"}, | |||||
| "847":{"name":"房地产工程建设","parent":"548"}, | |||||
| "848":{"name":"知识产权服务","parent":"559"}, | |||||
| "849":{"name":"新型建材","parent":"627"}, | |||||
| "850":{"name":"企业投资咨询","parent":"567"}, | |||||
| "851":{"name":"含乳饮料和植物蛋白饮料制造","parent":"594"}, | |||||
| "852":{"name":"汽车检测设备","parent":"629"}, | |||||
| "853":{"name":"手机通讯器材","parent":"417"}, | |||||
| "854":{"name":"环保材料","parent":"672"}, | |||||
| "855":{"name":"交通设施","parent":"554"}, | |||||
| "856":{"name":"电子器件","parent":"419"}, | |||||
| "857":{"name":"啤酒","parent":"594"}, | |||||
| "858":{"name":"生态旅游","parent":"657"}, | |||||
| "859":{"name":"自动化设备","parent":"626"}, | |||||
| "860":{"name":"软件开发","parent":"414"}, | |||||
| "861":{"name":"葡萄酒销售","parent":"594"}, | |||||
| "862":{"name":"钢材","parent":"633"}, | |||||
| "863":{"name":"餐饮培训","parent":"656"}, | |||||
| "864":{"name":"速冻食品","parent":"593"}, | |||||
| "865":{"name":"空气环保","parent":"672"}, | |||||
| "866":{"name":"互联网房地产经纪服务","parent":"550"}, | |||||
| "867":{"name":"食品添加剂","parent":"593"}, | |||||
| "868":{"name":"演艺传播","parent":"585"}, | |||||
| "869":{"name":"信用卡","parent":"537"}, | |||||
| "870":{"name":"报纸期刊广告","parent":"579"}, | |||||
| "871":{"name":"摄影","parent":"525"}, | |||||
| "872":{"name":"手机软件","parent":"414"}, | |||||
| "873":{"name":"地坪建材","parent":"627"}, | |||||
| "874":{"name":"企业管理咨询","parent":"567"}, | |||||
| "875":{"name":"幼儿教育","parent":"570"}, | |||||
| "876":{"name":"系统集成","parent":"416"}, | |||||
| "877":{"name":"皮革服饰","parent":"597"}, | |||||
| "878":{"name":"保健食品","parent":"593"}, | |||||
| "879":{"name":"叉车","parent":"620"}, | |||||
| "880":{"name":"厨卫电器","parent":"601"}, | |||||
| "882":{"name":"地暖设备","parent":"627"}, | |||||
| "883":{"name":"钢结构制造","parent":"548"}, | |||||
| "884":{"name":"投影机","parent":"606"}, | |||||
| "885":{"name":"啤酒销售","parent":"594"}, | |||||
| "886":{"name":"度假村旅游","parent":"657"}, | |||||
| "887":{"name":"电力元件设备","parent":"626"}, | |||||
| "888":{"name":"管理软件","parent":"414"}, | |||||
| "889":{"name":"轴承","parent":"628"}, | |||||
| "890":{"name":"餐饮设备","parent":"656"}, | |||||
| "891":{"name":"肉制品及副产品加工","parent":"593"}, | |||||
| "892":{"name":"艺术收藏品投资交易","parent":"584"}, | |||||
| "893":{"name":"净水器","parent":"601"}, | |||||
| "894":{"name":"进口食品","parent":"593"}, | |||||
| "895":{"name":"娱乐文化传播","parent":"585"}, | |||||
| "896":{"name":"文化传播","parent":"585"}, | |||||
| "897":{"name":"商旅传媒","parent":"580"}, | |||||
| "898":{"name":"广告设计制作","parent":"579"}, | |||||
| "899":{"name":"金属丝绳及其制品制造","parent":"627"}, | |||||
| "900":{"name":"建筑涂料","parent":"627"}, | |||||
| "901":{"name":"抵押贷款","parent":"543"}, | |||||
| "902":{"name":"早教","parent":"570"}, | |||||
| "903":{"name":"电影放映","parent":"583"}, | |||||
| "904":{"name":"内衣服饰","parent":"597"}, | |||||
| "905":{"name":"无线网络通信","parent":"418"}, | |||||
| "906":{"name":"记忆卡","parent":"415"}, | |||||
| "907":{"name":"女装服饰","parent":"597"}, | |||||
| "908":{"name":"建筑机械","parent":"620"}, | |||||
| "909":{"name":"制冷电器","parent":"601"}, | |||||
| "910":{"name":"通信设备","parent":"417"}, | |||||
| "911":{"name":"空调设备","parent":"601"}, | |||||
| "912":{"name":"建筑装饰","parent":"553"}, | |||||
| "913":{"name":"办公设备","parent":"603"}, | |||||
| "916":{"name":"数据处理软件","parent":"414"}, | |||||
| "917":{"name":"葡萄酒贸易","parent":"594"}, | |||||
| "918":{"name":"通讯器材","parent":"417"}, | |||||
| "919":{"name":"铜业","parent":"633"}, | |||||
| "920":{"name":"食堂","parent":"656"}, | |||||
| "921":{"name":"糖果零食","parent":"593"}, | |||||
| "922":{"name":"文化艺术传播","parent":"584"}, | |||||
| "923":{"name":"太阳能电器","parent":"601"}, | |||||
| "924":{"name":"药品零售","parent":"645"}, | |||||
| "925":{"name":"果蔬食品","parent":"593"}, | |||||
| "926":{"name":"文化活动策划","parent":"585"}, | |||||
| "928":{"name":"汽车广告","parent":"657"}, | |||||
| "929":{"name":"条码设备","parent":"630"}, | |||||
| "930":{"name":"建筑石材","parent":"627"}, | |||||
| "931":{"name":"贵金属","parent":"545"}, | |||||
| "932":{"name":"体育","parent":"660"}, | |||||
| "933":{"name":"金融信息服务","parent":"414"}, | |||||
| "934":{"name":"玻璃建材","parent":"627"}, | |||||
| "935":{"name":"家教","parent":"569"}, | |||||
| "936":{"name":"歌舞厅娱乐活动","parent":"586"}, | |||||
| "937":{"name":"计算机服务器","parent":"415"}, | |||||
| "938":{"name":"管道","parent":"627"}, | |||||
| "939":{"name":"婴幼儿服饰","parent":"597"}, | |||||
| "940":{"name":"热水器","parent":"601"}, | |||||
| "941":{"name":"计算机及零部件制造","parent":"415"}, | |||||
| "942":{"name":"钢铁贸易","parent":"633"}, | |||||
| "944":{"name":"包装材料","parent":"632"}, | |||||
| "945":{"name":"计算机办公设备","parent":"603"}, | |||||
| "946":{"name":"白酒","parent":"594"}, | |||||
| "948":{"name":"发动机","parent":"620"}, | |||||
| "949":{"name":"快餐服务","parent":"656"}, | |||||
| "950":{"name":"酒类销售","parent":"594"}, | |||||
| "951":{"name":"电子产品、机电设备","parent":"626"}, | |||||
| "952":{"name":"激光设备","parent":"626"}, | |||||
| "953":{"name":"餐饮策划","parent":"656"}, | |||||
| "954":{"name":"饮料、食品","parent":"594"}, | |||||
| "955":{"name":"文化娱乐经纪","parent":"585"}, | |||||
| "956":{"name":"天然气","parent":"665"}, | |||||
| "957":{"name":"农副食品","parent":"593"}, | |||||
| "958":{"name":"艺术表演","parent":"585"}, | |||||
| "959":{"name":"石膏、水泥制品及类似制品制造","parent":"627"}, | |||||
| "960":{"name":"橱柜","parent":"602"}, | |||||
| "961":{"name":"管理培训","parent":"577"}, | |||||
| "962":{"name":"男装服饰","parent":"597"}, | |||||
| "963":{"name":"化肥制造","parent":"675"}, | |||||
| "964":{"name":"童装服饰","parent":"597"}, | |||||
| "965":{"name":"电源电池","parent":"626"}, | |||||
| "966":{"name":"家电维修","parent":"664"}, | |||||
| "967":{"name":"光电子器件","parent":"419"}, | |||||
| "968":{"name":"旅行社服务","parent":"657"}, | |||||
| "969":{"name":"电线、电缆制造","parent":"626"}, | |||||
| "970":{"name":"软件开发、信息系统集成","parent":"419"}, | |||||
| "971":{"name":"白酒制造","parent":"594"}, | |||||
| "973":{"name":"甜品服务","parent":"656"}, | |||||
| "974":{"name":"糕点、面包制造","parent":"593"}, | |||||
| "975":{"name":"木工机械","parent":"620"}, | |||||
| "976":{"name":"酒吧服务","parent":"656"}, | |||||
| "977":{"name":"火腿肠","parent":"593"}, | |||||
| "978":{"name":"广告策划推广","parent":"579"}, | |||||
| "979":{"name":"新能源产品和生产装备制造","parent":"667"}, | |||||
| "980":{"name":"调味品","parent":"593"}, | |||||
| "981":{"name":"礼仪表演","parent":"585"}, | |||||
| "982":{"name":"劳务派遣","parent":"560"}, | |||||
| "983":{"name":"建材零售","parent":"627"}, | |||||
| "984":{"name":"商品交易中心","parent":"545"}, | |||||
| "985":{"name":"体育推广","parent":"585"}, | |||||
| "986":{"name":"茶饮料及其他饮料制造","parent":"594"}, | |||||
| "987":{"name":"金属建材","parent":"627"}, | |||||
| "988":{"name":"职业技能培训","parent":"571"}, | |||||
| "989":{"name":"网吧活动","parent":"586"}, | |||||
| "990":{"name":"洗衣服务","parent":"658"}, | |||||
| "991":{"name":"管道工程","parent":"554"}, | |||||
| "992":{"name":"通信工程","parent":"417"}, | |||||
| "993":{"name":"电子元器件","parent":"626"}, | |||||
| "994":{"name":"电子设备","parent":"419"}, | |||||
| "995":{"name":"茶馆服务","parent":"656"}, | |||||
| "996":{"name":"旅游开发","parent":"657"}, | |||||
| "997":{"name":"视频通讯","parent":"417"}, | |||||
| "998":{"name":"白酒销售","parent":"594"}, | |||||
| "1000":{"name":"咖啡馆服务","parent":"656"}, | |||||
| "1001":{"name":"食品零售","parent":"593"}, | |||||
| "1002":{"name":"健康疗养旅游","parent":"655"}, | |||||
| "1003":{"name":"粮油食品","parent":"593"}, | |||||
| "1004":{"name":"儿童教育影视","parent":"583"}, | |||||
| "1005":{"name":"新能源发电","parent":"667"}, | |||||
| "1006":{"name":"旅游策划","parent":"657"}, | |||||
| "1007":{"name":"绘画","parent":"575"}, | |||||
| "1008":{"name":"方便面及其他方便食品","parent":"593"}, | |||||
| "1009":{"name":"房地产经纪","parent":"550"}, | |||||
| "1010":{"name":"母婴家政","parent":"661"}, | |||||
| "1011":{"name":"居家养老健康服务","parent":"661"}, | |||||
| "1012":{"name":"文化艺术投资","parent":"545"}, | |||||
| "1013":{"name":"运动健身","parent":"660"}, | |||||
| "1014":{"name":"瓶(罐)装饮用水制造","parent":"594"}, | |||||
| "1015":{"name":"金属门窗","parent":"627"}, | |||||
| "1016":{"name":"机动车检测","parent":"563"}, | |||||
| "1017":{"name":"货物运输","parent":"634"}, | |||||
| "1018":{"name":"服饰专卖","parent":"690"}, | |||||
| "1019":{"name":"酒店服装","parent":"597"}, | |||||
| "1020":{"name":"通讯软件","parent":"417"}, | |||||
| "1021":{"name":"消防工程","parent":"554"}, | |||||
| "1022":{"name":"嵌入式电子系统","parent":"419"}, | |||||
| "1023":{"name":"航空票务","parent":"636"}, | |||||
| "1024":{"name":"电气设备","parent":"626"}, | |||||
| "1025":{"name":"酒业贸易","parent":"594"}, | |||||
| "1027":{"name":"其他饮料及冷饮服务","parent":"656"}, | |||||
| "1028":{"name":"乳制品","parent":"593"}, | |||||
| "1029":{"name":"新闻期刊出版","parent":"588"}, | |||||
| "1030":{"name":"水污染治理","parent":"672"}, | |||||
| "1031":{"name":"谷物食品","parent":"593"}, | |||||
| "1032":{"name":"数字动漫设计制造服务","parent":"590"}, | |||||
| "1033":{"name":"医院","parent":"646"}, | |||||
| "1034":{"name":"旅游广告","parent":"657"}, | |||||
| "1035":{"name":"办公家具","parent":"602"}, | |||||
| "1036":{"name":"房地产营销策划","parent":"550"}, | |||||
| "1037":{"name":"保洁家政","parent":"661"}, | |||||
| "1038":{"name":"水泥制造","parent":"627"}, | |||||
| "1039":{"name":"市场研究咨询","parent":"567"}, | |||||
| "1040":{"name":"驾校","parent":"571"}, | |||||
| "1041":{"name":"正餐服务","parent":"656"}, | |||||
| "1043":{"name":"机动车燃油","parent":"665"}, | |||||
| "1044":{"name":"食品","parent":"593"}, | |||||
| "1045":{"name":"新能源汽车","parent":"629"}, | |||||
| "1046":{"name":"手机无线网络推广","parent":"417"}, | |||||
| "1047":{"name":"环保设备","parent":"672"}, | |||||
| "1048":{"name":"通讯工程","parent":"418"}, | |||||
| "1049":{"name":"半导体集成电路","parent":"419"}, | |||||
| "1050":{"name":"航空服务","parent":"636"}, | |||||
| "1051":{"name":"电机设备","parent":"626"}, | |||||
| "1052":{"name":"档案软件","parent":"414"}, | |||||
| "1053":{"name":"冷链物流服务","parent":"634"}, | |||||
| "1054":{"name":"小吃服务","parent":"656"}, | |||||
| "1055":{"name":"水产品加工","parent":"593"}, | |||||
| "1056":{"name":"图书出版","parent":"588"}, | |||||
| "1057":{"name":"固体废物治理","parent":"672"}, | |||||
| "1059":{"name":"坚果食品","parent":"593"}, | |||||
| "1060":{"name":"广告传媒","parent":"579"}, | |||||
| "1061":{"name":"电梯","parent":"622"}, | |||||
| "1062":{"name":"社区医疗与卫生院","parent":"646"}, | |||||
| "1063":{"name":"广告、印刷包装","parent":"630"}, | |||||
| "1064":{"name":"婚纱礼服","parent":"662"}, | |||||
| "1065":{"name":"地毯","parent":"602"}, | |||||
| "1066":{"name":"互联网物业","parent":"551"}, | |||||
| "1067":{"name":"跨境电商","parent":"3"}, | |||||
| "1068":{"name":"信息安全、系统集成","parent":"9"}, | |||||
| "1069":{"name":"专用汽车制造","parent":"750"}, | |||||
| "1070":{"name":"商品贸易","parent":"3"}, | |||||
| "1071":{"name":"墙壁装饰材料","parent":"746"}, | |||||
| "1072":{"name":"窗帘装饰材料","parent":"746"}, | |||||
| "1073":{"name":"电子商务、本地生活服务","parent":"3"}, | |||||
| "1075":{"name":"白酒电子商务","parent":"3"}, | |||||
| "1076":{"name":"商品贸易、电子商务","parent":"3"}, | |||||
| "1077":{"name":"木质装饰材料","parent":"746"}, | |||||
| "1078":{"name":"电子商务、汽车电商交易平台","parent":"3"}, | |||||
| "1079":{"name":"汽车轮胎","parent":"751"}, | |||||
| "1080":{"name":"气体压缩机械制造","parent":"732"}, | |||||
| "1081":{"name":"家装家具电子商务","parent":"3"}, | |||||
| "1082":{"name":"化妆品电子商务","parent":"3"}, | |||||
| "1083":{"name":"汽车销售","parent":"749"}, | |||||
| "1084":{"name":"新闻资讯网站","parent":"510"}, | |||||
| "1085":{"name":"母婴电商","parent":"3"}, | |||||
| "1086":{"name":"电商商务、收藏品交易","parent":"3"}, | |||||
| "1088":{"name":"电子商务、数码产品","parent":"3"}, | |||||
| "1089":{"name":"二手车交易","parent":"749"}, | |||||
| "1090":{"name":"游戏制作服务","parent":"5"}, | |||||
| "1091":{"name":"母婴服务","parent":"510"}, | |||||
| "1092":{"name":"家具电子商务","parent":"3"}, | |||||
| "1093":{"name":"汽车配件电子商务","parent":"3"}, | |||||
| "1094":{"name":"输配电设备","parent":"777"}, | |||||
| "1095":{"name":"矿山设备","parent":"727"}, | |||||
| "1096":{"name":"机床机械","parent":"726"}, | |||||
| "1097":{"name":"农产品电商","parent":"3"}, | |||||
| "1098":{"name":"陶瓷装饰材料","parent":"746"}, | |||||
| "1099":{"name":"车载联网设备","parent":"487"}, | |||||
| "1100":{"name":"汽车销售电子商务","parent":"3"}, | |||||
| "1101":{"name":"石油设备","parent":"730"}, | |||||
| "1102":{"name":"智能家居","parent":"487"}, | |||||
| "1103":{"name":"散热器","parent":"751"}, | |||||
| "1104":{"name":"电力工程","parent":"775"}, | |||||
| "1105":{"name":"生鲜电商","parent":"3"}, | |||||
| "1106":{"name":"互联网数据服务","parent":"490"}, | |||||
| "1107":{"name":"房车、商务车销售","parent":"749"}, | |||||
| "1108":{"name":"茶叶电子商务","parent":"3"}, | |||||
| "1109":{"name":"酒类电子商务","parent":"3"}, | |||||
| "1110":{"name":"阀门","parent":"730"}, | |||||
| "1111":{"name":"食品电商","parent":"3"}, | |||||
| "1112":{"name":"儿童摄影","parent":"871"}, | |||||
| "1113":{"name":"广告摄影","parent":"871"}, | |||||
| "1114":{"name":"婚纱摄影","parent":"871"}, | |||||
| "1115":{"name":"模具制造","parent":"620"}, | |||||
| "1116":{"name":"汽车模具","parent":"629"}, | |||||
| "1117":{"name":"认证咨询","parent":"567"}, | |||||
| "1118":{"name":"数字视觉制作服务","parent":"590"}, | |||||
| "1119":{"name":"牙科及医疗器械","parent":"646"}, | |||||
| "1120":{"name":"猎头招聘","parent":"560"}, | |||||
| "1121":{"name":"家居","parent":"518"}, | |||||
| "1122":{"name":"收藏品","parent":"518"}, | |||||
| "1123":{"name":"首饰","parent":"518"}, | |||||
| "1124":{"name":"工艺品","parent":"518"}, | |||||
| "1125":{"name":"财务","parent":"515"}, | |||||
| "1126":{"name":"税务","parent":"515"}, | |||||
| "1127":{"name":"分类信息","parent":"2"}, | |||||
| "1128":{"name":"宠物","parent":"0"}, | |||||
| "1129":{"name":"快消品","parent":"518"}, | |||||
| "1130":{"name":"人工智能","parent":"2"}, | |||||
| "1131":{"name":"农/林/牧/渔","parent":"0"} | |||||
| } | |||||
def get_names(id):
    """Return the name of entry ``id`` followed by the names of its ancestors.

    Looks ``id`` up in the module-level ``TBL`` mapping, then repeatedly
    follows each entry's ``"parent"`` key, collecting the ``"name"`` values
    on the way up.

    Args:
        id: Key of the starting entry. It is converted with ``str()`` before
            the lookup, so an ``int`` works as well as a ``str``. (The name
            shadows the ``id`` builtin; it is kept so existing callers that
            pass it by keyword keep working.)

    Returns:
        A list of names ordered from the entry itself up to its top-most
        ancestor present in ``TBL``. Empty when ``id`` is not in ``TBL``.
    """
    names = []
    # Keys already visited: guarantees termination even if the table ever
    # contains a self-referencing or cyclic parent chain (the recursive
    # version would hit RecursionError in that case).
    seen = set()
    key = str(id)
    while key not in seen:
        entry = TBL.get(key)
        if not entry:
            # Unknown key (for instance the parent of a top-level entry):
            # the chain ends here.
            break
        seen.add(key)
        names.append(entry["name"])
        key = str(entry["parent"])
    return names
if __name__ == "__main__":
    # Manual smoke check when the module is run as a script: print the name
    # chain that get_names() builds for table entry "1119".
    sample_id = "1119"
    print(get_names(sample_id))
| TBL = { | |||||
| "2":{"name":"北京","parent":"1"}, | |||||
| "3":{"name":"天津","parent":"1"}, | |||||
| "4":{"name":"河北","parent":"1"}, | |||||
| "5":{"name":"山西","parent":"1"}, | |||||
| "6":{"name":"内蒙古","parent":"1"}, | |||||
| "7":{"name":"辽宁","parent":"1"}, | |||||
| "8":{"name":"吉林","parent":"1"}, | |||||
| "9":{"name":"黑龙江","parent":"1"}, | |||||
| "10":{"name":"上海","parent":"1"}, | |||||
| "11":{"name":"江苏","parent":"1"}, | |||||
| "12":{"name":"浙江","parent":"1"}, | |||||
| "13":{"name":"安徽","parent":"1"}, | |||||
| "14":{"name":"福建","parent":"1"}, | |||||
| "15":{"name":"江西","parent":"1"}, | |||||
| "16":{"name":"山东","parent":"1"}, | |||||
| "17":{"name":"河南","parent":"1"}, | |||||
| "18":{"name":"湖北","parent":"1"}, | |||||
| "19":{"name":"湖南","parent":"1"}, | |||||
| "20":{"name":"广东","parent":"1"}, | |||||
| "21":{"name":"广西","parent":"1"}, | |||||
| "22":{"name":"海南","parent":"1"}, | |||||
| "23":{"name":"重庆","parent":"1"}, | |||||
| "24":{"name":"四川","parent":"1"}, | |||||
| "25":{"name":"贵州","parent":"1"}, | |||||
| "26":{"name":"云南","parent":"1"}, | |||||
| "27":{"name":"西藏","parent":"1"}, | |||||
| "28":{"name":"陕西","parent":"1"}, | |||||
| "29":{"name":"甘肃","parent":"1"}, | |||||
| "30":{"name":"青海","parent":"1"}, | |||||
| "31":{"name":"宁夏","parent":"1"}, | |||||
| "32":{"name":"新疆","parent":"1"}, | |||||
| "33":{"name":"北京市","parent":"2"}, | |||||
| "34":{"name":"天津市","parent":"3"}, | |||||
| "35":{"name":"石家庄市","parent":"4"}, | |||||
| "36":{"name":"唐山市","parent":"4"}, | |||||
| "37":{"name":"秦皇岛市","parent":"4"}, | |||||
| "38":{"name":"邯郸市","parent":"4"}, | |||||
| "39":{"name":"邢台市","parent":"4"}, | |||||
| "40":{"name":"保定市","parent":"4"}, | |||||
| "41":{"name":"张家口市","parent":"4"}, | |||||
| "42":{"name":"承德市","parent":"4"}, | |||||
| "43":{"name":"沧州市","parent":"4"}, | |||||
| "44":{"name":"廊坊市","parent":"4"}, | |||||
| "45":{"name":"衡水市","parent":"4"}, | |||||
| "46":{"name":"太原市","parent":"5"}, | |||||
| "47":{"name":"大同市","parent":"5"}, | |||||
| "48":{"name":"阳泉市","parent":"5"}, | |||||
| "49":{"name":"长治市","parent":"5"}, | |||||
| "50":{"name":"晋城市","parent":"5"}, | |||||
| "51":{"name":"朔州市","parent":"5"}, | |||||
| "52":{"name":"晋中市","parent":"5"}, | |||||
| "53":{"name":"运城市","parent":"5"}, | |||||
| "54":{"name":"忻州市","parent":"5"}, | |||||
| "55":{"name":"临汾市","parent":"5"}, | |||||
| "56":{"name":"吕梁市","parent":"5"}, | |||||
| "57":{"name":"呼和浩特市","parent":"6"}, | |||||
| "58":{"name":"包头市","parent":"6"}, | |||||
| "59":{"name":"乌海市","parent":"6"}, | |||||
| "60":{"name":"赤峰市","parent":"6"}, | |||||
| "61":{"name":"通辽市","parent":"6"}, | |||||
| "62":{"name":"鄂尔多斯市","parent":"6"}, | |||||
| "63":{"name":"呼伦贝尔市","parent":"6"}, | |||||
| "64":{"name":"巴彦淖尔市","parent":"6"}, | |||||
| "65":{"name":"乌兰察布市","parent":"6"}, | |||||
| "66":{"name":"兴安盟","parent":"6"}, | |||||
| "67":{"name":"锡林郭勒盟","parent":"6"}, | |||||
| "68":{"name":"阿拉善盟","parent":"6"}, | |||||
| "69":{"name":"沈阳市","parent":"7"}, | |||||
| "70":{"name":"大连市","parent":"7"}, | |||||
| "71":{"name":"鞍山市","parent":"7"}, | |||||
| "72":{"name":"抚顺市","parent":"7"}, | |||||
| "73":{"name":"本溪市","parent":"7"}, | |||||
| "74":{"name":"丹东市","parent":"7"}, | |||||
| "75":{"name":"锦州市","parent":"7"}, | |||||
| "76":{"name":"营口市","parent":"7"}, | |||||
| "77":{"name":"阜新市","parent":"7"}, | |||||
| "78":{"name":"辽阳市","parent":"7"}, | |||||
| "79":{"name":"盘锦市","parent":"7"}, | |||||
| "80":{"name":"铁岭市","parent":"7"}, | |||||
| "81":{"name":"朝阳市","parent":"7"}, | |||||
| "82":{"name":"葫芦岛市","parent":"7"}, | |||||
| "83":{"name":"长春市","parent":"8"}, | |||||
| "84":{"name":"吉林市","parent":"8"}, | |||||
| "85":{"name":"四平市","parent":"8"}, | |||||
| "86":{"name":"辽源市","parent":"8"}, | |||||
| "87":{"name":"通化市","parent":"8"}, | |||||
| "88":{"name":"白山市","parent":"8"}, | |||||
| "89":{"name":"松原市","parent":"8"}, | |||||
| "90":{"name":"白城市","parent":"8"}, | |||||
| "91":{"name":"延边朝鲜族自治州","parent":"8"}, | |||||
| "92":{"name":"哈尔滨市","parent":"9"}, | |||||
| "93":{"name":"齐齐哈尔市","parent":"9"}, | |||||
| "94":{"name":"鸡西市","parent":"9"}, | |||||
| "95":{"name":"鹤岗市","parent":"9"}, | |||||
| "96":{"name":"双鸭山市","parent":"9"}, | |||||
| "97":{"name":"大庆市","parent":"9"}, | |||||
| "98":{"name":"伊春市","parent":"9"}, | |||||
| "99":{"name":"佳木斯市","parent":"9"}, | |||||
| "100":{"name":"七台河市","parent":"9"}, | |||||
| "101":{"name":"牡丹江市","parent":"9"}, | |||||
| "102":{"name":"黑河市","parent":"9"}, | |||||
| "103":{"name":"绥化市","parent":"9"}, | |||||
| "104":{"name":"大兴安岭地区","parent":"9"}, | |||||
| "105":{"name":"上海市","parent":"10"}, | |||||
| "106":{"name":"南京市","parent":"11"}, | |||||
| "107":{"name":"无锡市","parent":"11"}, | |||||
| "108":{"name":"徐州市","parent":"11"}, | |||||
| "109":{"name":"常州市","parent":"11"}, | |||||
| "110":{"name":"苏州市","parent":"11"}, | |||||
| "111":{"name":"南通市","parent":"11"}, | |||||
| "112":{"name":"连云港市","parent":"11"}, | |||||
| "113":{"name":"淮安市","parent":"11"}, | |||||
| "114":{"name":"盐城市","parent":"11"}, | |||||
| "115":{"name":"扬州市","parent":"11"}, | |||||
| "116":{"name":"镇江市","parent":"11"}, | |||||
| "117":{"name":"泰州市","parent":"11"}, | |||||
| "118":{"name":"宿迁市","parent":"11"}, | |||||
| "119":{"name":"杭州市","parent":"12"}, | |||||
| "120":{"name":"宁波市","parent":"12"}, | |||||
| "121":{"name":"温州市","parent":"12"}, | |||||
| "122":{"name":"嘉兴市","parent":"12"}, | |||||
| "123":{"name":"湖州市","parent":"12"}, | |||||
| "124":{"name":"绍兴市","parent":"12"}, | |||||
| "125":{"name":"金华市","parent":"12"}, | |||||
| "126":{"name":"衢州市","parent":"12"}, | |||||
| "127":{"name":"舟山市","parent":"12"}, | |||||
| "128":{"name":"台州市","parent":"12"}, | |||||
| "129":{"name":"丽水市","parent":"12"}, | |||||
| "130":{"name":"合肥市","parent":"13"}, | |||||
| "131":{"name":"芜湖市","parent":"13"}, | |||||
| "132":{"name":"蚌埠市","parent":"13"}, | |||||
| "133":{"name":"淮南市","parent":"13"}, | |||||
| "134":{"name":"马鞍山市","parent":"13"}, | |||||
| "135":{"name":"淮北市","parent":"13"}, | |||||
| "136":{"name":"铜陵市","parent":"13"}, | |||||
| "137":{"name":"安庆市","parent":"13"}, | |||||
| "138":{"name":"黄山市","parent":"13"}, | |||||
| "139":{"name":"滁州市","parent":"13"}, | |||||
| "140":{"name":"阜阳市","parent":"13"}, | |||||
| "141":{"name":"宿州市","parent":"13"}, | |||||
| "143":{"name":"六安市","parent":"13"}, | |||||
| "144":{"name":"亳州市","parent":"13"}, | |||||
| "145":{"name":"池州市","parent":"13"}, | |||||
| "146":{"name":"宣城市","parent":"13"}, | |||||
| "147":{"name":"福州市","parent":"14"}, | |||||
| "148":{"name":"厦门市","parent":"14"}, | |||||
| "149":{"name":"莆田市","parent":"14"}, | |||||
| "150":{"name":"三明市","parent":"14"}, | |||||
| "151":{"name":"泉州市","parent":"14"}, | |||||
| "152":{"name":"漳州市","parent":"14"}, | |||||
| "153":{"name":"南平市","parent":"14"}, | |||||
| "154":{"name":"龙岩市","parent":"14"}, | |||||
| "155":{"name":"宁德市","parent":"14"}, | |||||
| "156":{"name":"南昌市","parent":"15"}, | |||||
| "157":{"name":"景德镇市","parent":"15"}, | |||||
| "158":{"name":"萍乡市","parent":"15"}, | |||||
| "159":{"name":"九江市","parent":"15"}, | |||||
| "160":{"name":"新余市","parent":"15"}, | |||||
| "161":{"name":"鹰潭市","parent":"15"}, | |||||
| "162":{"name":"赣州市","parent":"15"}, | |||||
| "163":{"name":"吉安市","parent":"15"}, | |||||
| "164":{"name":"宜春市","parent":"15"}, | |||||
| "165":{"name":"抚州市","parent":"15"}, | |||||
| "166":{"name":"上饶市","parent":"15"}, | |||||
| "167":{"name":"济南市","parent":"16"}, | |||||
| "168":{"name":"青岛市","parent":"16"}, | |||||
| "169":{"name":"淄博市","parent":"16"}, | |||||
| "170":{"name":"枣庄市","parent":"16"}, | |||||
| "171":{"name":"东营市","parent":"16"}, | |||||
| "172":{"name":"烟台市","parent":"16"}, | |||||
| "173":{"name":"潍坊市","parent":"16"}, | |||||
| "174":{"name":"济宁市","parent":"16"}, | |||||
| "175":{"name":"泰安市","parent":"16"}, | |||||
| "176":{"name":"威海市","parent":"16"}, | |||||
| "177":{"name":"日照市","parent":"16"}, | |||||
| "179":{"name":"临沂市","parent":"16"}, | |||||
| "180":{"name":"德州市","parent":"16"}, | |||||
| "181":{"name":"聊城市","parent":"16"}, | |||||
| "182":{"name":"滨州市","parent":"16"}, | |||||
| "183":{"name":"菏泽市","parent":"16"}, | |||||
| "184":{"name":"郑州市","parent":"17"}, | |||||
| "185":{"name":"开封市","parent":"17"}, | |||||
| "186":{"name":"洛阳市","parent":"17"}, | |||||
| "187":{"name":"平顶山市","parent":"17"}, | |||||
| "188":{"name":"安阳市","parent":"17"}, | |||||
| "189":{"name":"鹤壁市","parent":"17"}, | |||||
| "190":{"name":"新乡市","parent":"17"}, | |||||
| "191":{"name":"焦作市","parent":"17"}, | |||||
| "192":{"name":"濮阳市","parent":"17"}, | |||||
| "193":{"name":"许昌市","parent":"17"}, | |||||
| "194":{"name":"漯河市","parent":"17"}, | |||||
| "195":{"name":"三门峡市","parent":"17"}, | |||||
| "196":{"name":"南阳市","parent":"17"}, | |||||
| "197":{"name":"商丘市","parent":"17"}, | |||||
| "198":{"name":"信阳市","parent":"17"}, | |||||
| "199":{"name":"周口市","parent":"17"}, | |||||
| "200":{"name":"驻马店市","parent":"17"}, | |||||
| "201":{"name":"武汉市","parent":"18"}, | |||||
| "202":{"name":"黄石市","parent":"18"}, | |||||
| "203":{"name":"十堰市","parent":"18"}, | |||||
| "204":{"name":"宜昌市","parent":"18"}, | |||||
| "205":{"name":"襄阳市","parent":"18"}, | |||||
| "206":{"name":"鄂州市","parent":"18"}, | |||||
| "207":{"name":"荆门市","parent":"18"}, | |||||
| "208":{"name":"孝感市","parent":"18"}, | |||||
| "209":{"name":"荆州市","parent":"18"}, | |||||
| "210":{"name":"黄冈市","parent":"18"}, | |||||
| "211":{"name":"咸宁市","parent":"18"}, | |||||
| "212":{"name":"随州市","parent":"18"}, | |||||
| "213":{"name":"恩施土家族苗族自治州","parent":"18"}, | |||||
| "215":{"name":"长沙市","parent":"19"}, | |||||
| "216":{"name":"株洲市","parent":"19"}, | |||||
| "217":{"name":"湘潭市","parent":"19"}, | |||||
| "218":{"name":"衡阳市","parent":"19"}, | |||||
| "219":{"name":"邵阳市","parent":"19"}, | |||||
| "220":{"name":"岳阳市","parent":"19"}, | |||||
| "221":{"name":"常德市","parent":"19"}, | |||||
| "222":{"name":"张家界市","parent":"19"}, | |||||
| "223":{"name":"益阳市","parent":"19"}, | |||||
| "224":{"name":"郴州市","parent":"19"}, | |||||
| "225":{"name":"永州市","parent":"19"}, | |||||
| "226":{"name":"怀化市","parent":"19"}, | |||||
| "227":{"name":"娄底市","parent":"19"}, | |||||
| "228":{"name":"湘西土家族苗族自治州","parent":"19"}, | |||||
| "229":{"name":"广州市","parent":"20"}, | |||||
| "230":{"name":"韶关市","parent":"20"}, | |||||
| "231":{"name":"深圳市","parent":"20"}, | |||||
| "232":{"name":"珠海市","parent":"20"}, | |||||
| "233":{"name":"汕头市","parent":"20"}, | |||||
| "234":{"name":"佛山市","parent":"20"}, | |||||
| "235":{"name":"江门市","parent":"20"}, | |||||
| "236":{"name":"湛江市","parent":"20"}, | |||||
| "237":{"name":"茂名市","parent":"20"}, | |||||
| "238":{"name":"肇庆市","parent":"20"}, | |||||
| "239":{"name":"惠州市","parent":"20"}, | |||||
| "240":{"name":"梅州市","parent":"20"}, | |||||
| "241":{"name":"汕尾市","parent":"20"}, | |||||
| "242":{"name":"河源市","parent":"20"}, | |||||
| "243":{"name":"阳江市","parent":"20"}, | |||||
| "244":{"name":"清远市","parent":"20"}, | |||||
| "245":{"name":"东莞市","parent":"20"}, | |||||
| "246":{"name":"中山市","parent":"20"}, | |||||
| "247":{"name":"潮州市","parent":"20"}, | |||||
| "248":{"name":"揭阳市","parent":"20"}, | |||||
| "249":{"name":"云浮市","parent":"20"}, | |||||
| "250":{"name":"南宁市","parent":"21"}, | |||||
| "251":{"name":"柳州市","parent":"21"}, | |||||
| "252":{"name":"桂林市","parent":"21"}, | |||||
| "253":{"name":"梧州市","parent":"21"}, | |||||
| "254":{"name":"北海市","parent":"21"}, | |||||
| "255":{"name":"防城港市","parent":"21"}, | |||||
| "256":{"name":"钦州市","parent":"21"}, | |||||
| "257":{"name":"贵港市","parent":"21"}, | |||||
| "258":{"name":"玉林市","parent":"21"}, | |||||
| "259":{"name":"百色市","parent":"21"}, | |||||
| "260":{"name":"贺州市","parent":"21"}, | |||||
| "261":{"name":"河池市","parent":"21"}, | |||||
| "262":{"name":"来宾市","parent":"21"}, | |||||
| "263":{"name":"崇左市","parent":"21"}, | |||||
| "264":{"name":"海口市","parent":"22"}, | |||||
| "265":{"name":"三亚市","parent":"22"}, | |||||
| "267":{"name":"重庆市","parent":"23"}, | |||||
| "268":{"name":"成都市","parent":"24"}, | |||||
| "269":{"name":"自贡市","parent":"24"}, | |||||
| "270":{"name":"攀枝花市","parent":"24"}, | |||||
| "271":{"name":"泸州市","parent":"24"}, | |||||
| "272":{"name":"德阳市","parent":"24"}, | |||||
| "273":{"name":"绵阳市","parent":"24"}, | |||||
| "274":{"name":"广元市","parent":"24"}, | |||||
| "275":{"name":"遂宁市","parent":"24"}, | |||||
| "276":{"name":"内江市","parent":"24"}, | |||||
| "277":{"name":"乐山市","parent":"24"}, | |||||
| "278":{"name":"南充市","parent":"24"}, | |||||
| "279":{"name":"眉山市","parent":"24"}, | |||||
| "280":{"name":"宜宾市","parent":"24"}, | |||||
| "281":{"name":"广安市","parent":"24"}, | |||||
| "282":{"name":"达州市","parent":"24"}, | |||||
| "283":{"name":"雅安市","parent":"24"}, | |||||
| "284":{"name":"巴中市","parent":"24"}, | |||||
| "285":{"name":"资阳市","parent":"24"}, | |||||
| "286":{"name":"阿坝藏族羌族自治州","parent":"24"}, | |||||
| "287":{"name":"甘孜藏族自治州","parent":"24"}, | |||||
| "288":{"name":"凉山彝族自治州","parent":"24"}, | |||||
| "289":{"name":"贵阳市","parent":"25"}, | |||||
| "290":{"name":"六盘水市","parent":"25"}, | |||||
| "291":{"name":"遵义市","parent":"25"}, | |||||
| "292":{"name":"安顺市","parent":"25"}, | |||||
| "293":{"name":"铜仁市","parent":"25"}, | |||||
| "294":{"name":"黔西南布依族苗族自治州","parent":"25"}, | |||||
| "295":{"name":"毕节市","parent":"25"}, | |||||
| "296":{"name":"黔东南苗族侗族自治州","parent":"25"}, | |||||
| "297":{"name":"黔南布依族苗族自治州","parent":"25"}, | |||||
| "298":{"name":"昆明市","parent":"26"}, | |||||
| "299":{"name":"曲靖市","parent":"26"}, | |||||
| "300":{"name":"玉溪市","parent":"26"}, | |||||
| "301":{"name":"保山市","parent":"26"}, | |||||
| "302":{"name":"昭通市","parent":"26"}, | |||||
| "303":{"name":"丽江市","parent":"26"}, | |||||
| "304":{"name":"普洱市","parent":"26"}, | |||||
| "305":{"name":"临沧市","parent":"26"}, | |||||
| "306":{"name":"楚雄彝族自治州","parent":"26"}, | |||||
| "307":{"name":"红河哈尼族彝族自治州","parent":"26"}, | |||||
| "308":{"name":"文山壮族苗族自治州","parent":"26"}, | |||||
| "309":{"name":"西双版纳傣族自治州","parent":"26"}, | |||||
| "310":{"name":"大理白族自治州","parent":"26"}, | |||||
| "311":{"name":"德宏傣族景颇族自治州","parent":"26"}, | |||||
| "312":{"name":"怒江傈僳族自治州","parent":"26"}, | |||||
| "313":{"name":"迪庆藏族自治州","parent":"26"}, | |||||
| "314":{"name":"拉萨市","parent":"27"}, | |||||
| "315":{"name":"昌都市","parent":"27"}, | |||||
| "316":{"name":"山南市","parent":"27"}, | |||||
| "317":{"name":"日喀则市","parent":"27"}, | |||||
| "318":{"name":"那曲市","parent":"27"}, | |||||
| "319":{"name":"阿里地区","parent":"27"}, | |||||
| "320":{"name":"林芝市","parent":"27"}, | |||||
| "321":{"name":"西安市","parent":"28"}, | |||||
| "322":{"name":"铜川市","parent":"28"}, | |||||
| "323":{"name":"宝鸡市","parent":"28"}, | |||||
| "324":{"name":"咸阳市","parent":"28"}, | |||||
| "325":{"name":"渭南市","parent":"28"}, | |||||
| "326":{"name":"延安市","parent":"28"}, | |||||
| "327":{"name":"汉中市","parent":"28"}, | |||||
| "328":{"name":"榆林市","parent":"28"}, | |||||
| "329":{"name":"安康市","parent":"28"}, | |||||
| "330":{"name":"商洛市","parent":"28"}, | |||||
| "331":{"name":"兰州市","parent":"29"}, | |||||
| "332":{"name":"嘉峪关市","parent":"29"}, | |||||
| "333":{"name":"金昌市","parent":"29"}, | |||||
| "334":{"name":"白银市","parent":"29"}, | |||||
| "335":{"name":"天水市","parent":"29"}, | |||||
| "336":{"name":"武威市","parent":"29"}, | |||||
| "337":{"name":"张掖市","parent":"29"}, | |||||
| "338":{"name":"平凉市","parent":"29"}, | |||||
| "339":{"name":"酒泉市","parent":"29"}, | |||||
| "340":{"name":"庆阳市","parent":"29"}, | |||||
| "341":{"name":"定西市","parent":"29"}, | |||||
| "342":{"name":"陇南市","parent":"29"}, | |||||
| "343":{"name":"临夏回族自治州","parent":"29"}, | |||||
| "344":{"name":"甘南藏族自治州","parent":"29"}, | |||||
| "345":{"name":"西宁市","parent":"30"}, | |||||
| "346":{"name":"海东市","parent":"30"}, | |||||
| "347":{"name":"海北藏族自治州","parent":"30"}, | |||||
| "348":{"name":"黄南藏族自治州","parent":"30"}, | |||||
| "349":{"name":"海南藏族自治州","parent":"30"}, | |||||
| "350":{"name":"果洛藏族自治州","parent":"30"}, | |||||
| "351":{"name":"玉树藏族自治州","parent":"30"}, | |||||
| "352":{"name":"海西蒙古族藏族自治州","parent":"30"}, | |||||
| "353":{"name":"银川市","parent":"31"}, | |||||
| "354":{"name":"石嘴山市","parent":"31"}, | |||||
| "355":{"name":"吴忠市","parent":"31"}, | |||||
| "356":{"name":"固原市","parent":"31"}, | |||||
| "357":{"name":"中卫市","parent":"31"}, | |||||
| "358":{"name":"乌鲁木齐市","parent":"32"}, | |||||
| "359":{"name":"克拉玛依市","parent":"32"}, | |||||
| "360":{"name":"吐鲁番市","parent":"32"}, | |||||
| "361":{"name":"哈密市","parent":"32"}, | |||||
| "362":{"name":"昌吉回族自治州","parent":"32"}, | |||||
| "363":{"name":"博尔塔拉蒙古自治州","parent":"32"}, | |||||
| "364":{"name":"巴音郭楞蒙古自治州","parent":"32"}, | |||||
| "365":{"name":"阿克苏地区","parent":"32"}, | |||||
| "366":{"name":"克孜勒苏柯尔克孜自治州","parent":"32"}, | |||||
| "367":{"name":"喀什地区","parent":"32"}, | |||||
| "368":{"name":"和田地区","parent":"32"}, | |||||
| "369":{"name":"伊犁哈萨克自治州","parent":"32"}, | |||||
| "370":{"name":"塔城地区","parent":"32"}, | |||||
| "371":{"name":"阿勒泰地区","parent":"32"}, | |||||
| "372":{"name":"新疆省直辖行政单位","parent":"32"}, | |||||
| "373":{"name":"可克达拉市","parent":"32"}, | |||||
| "374":{"name":"昆玉市","parent":"32"}, | |||||
| "375":{"name":"胡杨河市","parent":"32"}, | |||||
| "376":{"name":"双河市","parent":"32"}, | |||||
| "3560":{"name":"北票市","parent":"7"}, | |||||
| "3615":{"name":"高州市","parent":"20"}, | |||||
| "3651":{"name":"济源市","parent":"17"}, | |||||
| "3662":{"name":"胶南市","parent":"16"}, | |||||
| "3683":{"name":"老河口市","parent":"18"}, | |||||
| "3758":{"name":"沙河市","parent":"4"}, | |||||
| "3822":{"name":"宜城市","parent":"18"}, | |||||
| "3842":{"name":"枣阳市","parent":"18"}, | |||||
| "3850":{"name":"肇东市","parent":"9"}, | |||||
| "3905":{"name":"澳门","parent":"1"}, | |||||
| "3906":{"name":"澳门","parent":"3905"}, | |||||
| "3907":{"name":"香港","parent":"1"}, | |||||
| "3908":{"name":"香港","parent":"3907"}, | |||||
| "3947":{"name":"仙桃市","parent":"18"}, | |||||
| "3954":{"name":"台湾","parent":"1"}, | |||||
| "3955":{"name":"台湾","parent":"3954"}, | |||||
| "3956":{"name":"海外","parent":"1"}, | |||||
| "3957":{"name":"海外","parent":"3956"}, | |||||
| "3958":{"name":"美国","parent":"3956"}, | |||||
| "3959":{"name":"加拿大","parent":"3956"}, | |||||
| "3961":{"name":"日本","parent":"3956"}, | |||||
| "3962":{"name":"韩国","parent":"3956"}, | |||||
| "3963":{"name":"德国","parent":"3956"}, | |||||
| "3964":{"name":"英国","parent":"3956"}, | |||||
| "3965":{"name":"意大利","parent":"3956"}, | |||||
| "3966":{"name":"西班牙","parent":"3956"}, | |||||
| "3967":{"name":"法国","parent":"3956"}, | |||||
| "3968":{"name":"澳大利亚","parent":"3956"}, | |||||
| "3969":{"name":"东城区","parent":"2"}, | |||||
| "3970":{"name":"西城区","parent":"2"}, | |||||
| "3971":{"name":"崇文区","parent":"2"}, | |||||
| "3972":{"name":"宣武区","parent":"2"}, | |||||
| "3973":{"name":"朝阳区","parent":"2"}, | |||||
| "3974":{"name":"海淀区","parent":"2"}, | |||||
| "3975":{"name":"丰台区","parent":"2"}, | |||||
| "3976":{"name":"石景山区","parent":"2"}, | |||||
| "3977":{"name":"门头沟区","parent":"2"}, | |||||
| "3978":{"name":"房山区","parent":"2"}, | |||||
| "3979":{"name":"通州区","parent":"2"}, | |||||
| "3980":{"name":"顺义区","parent":"2"}, | |||||
| "3981":{"name":"昌平区","parent":"2"}, | |||||
| "3982":{"name":"大兴区","parent":"2"}, | |||||
| "3983":{"name":"平谷区","parent":"2"}, | |||||
| "3984":{"name":"怀柔区","parent":"2"}, | |||||
| "3985":{"name":"密云区","parent":"2"}, | |||||
| "3986":{"name":"延庆区","parent":"2"}, | |||||
| "3987":{"name":"黄浦区","parent":"10"}, | |||||
| "3988":{"name":"徐汇区","parent":"10"}, | |||||
| "3989":{"name":"长宁区","parent":"10"}, | |||||
| "3990":{"name":"静安区","parent":"10"}, | |||||
| "3991":{"name":"普陀区","parent":"10"}, | |||||
| "3992":{"name":"闸北区","parent":"10"}, | |||||
| "3993":{"name":"虹口区","parent":"10"}, | |||||
| "3994":{"name":"杨浦区","parent":"10"}, | |||||
| "3995":{"name":"宝山区","parent":"10"}, | |||||
| "3996":{"name":"闵行区","parent":"10"}, | |||||
| "3997":{"name":"嘉定区","parent":"10"}, | |||||
| "3998":{"name":"浦东新区","parent":"10"}, | |||||
| "3999":{"name":"松江区","parent":"10"}, | |||||
| "4000":{"name":"金山区","parent":"10"}, | |||||
| "4001":{"name":"青浦区","parent":"10"}, | |||||
| "4002":{"name":"奉贤区","parent":"10"}, | |||||
| "4003":{"name":"崇明区","parent":"10"}, | |||||
| "4004":{"name":"和平区","parent":"3"}, | |||||
| "4005":{"name":"河东区","parent":"3"}, | |||||
| "4006":{"name":"河西区","parent":"3"}, | |||||
| "4007":{"name":"南开区","parent":"3"}, | |||||
| "4008":{"name":"红桥区","parent":"3"}, | |||||
| "4009":{"name":"河北区","parent":"3"}, | |||||
| "4010":{"name":"滨海新区","parent":"3"}, | |||||
| "4011":{"name":"东丽区","parent":"3"}, | |||||
| "4012":{"name":"西青区","parent":"3"}, | |||||
| "4013":{"name":"北辰区","parent":"3"}, | |||||
| "4014":{"name":"津南区","parent":"3"}, | |||||
| "4015":{"name":"武清区","parent":"3"}, | |||||
| "4016":{"name":"宝坻区","parent":"3"}, | |||||
| "4017":{"name":"静海区","parent":"3"}, | |||||
| "4018":{"name":"宁河区","parent":"3"}, | |||||
| "4019":{"name":"蓟州区","parent":"3"}, | |||||
| "4020":{"name":"渝中区","parent":"23"}, | |||||
| "4021":{"name":"江北区","parent":"23"}, | |||||
| "4022":{"name":"南岸区","parent":"23"}, | |||||
| "4023":{"name":"沙坪坝区","parent":"23"}, | |||||
| "4024":{"name":"九龙坡区","parent":"23"}, | |||||
| "4025":{"name":"大渡口区","parent":"23"}, | |||||
| "4026":{"name":"渝北区","parent":"23"}, | |||||
| "4027":{"name":"巴南区","parent":"23"}, | |||||
| "4028":{"name":"北碚区","parent":"23"}, | |||||
| "4029":{"name":"万州区","parent":"23"}, | |||||
| "4030":{"name":"黔江区","parent":"23"}, | |||||
| "4031":{"name":"永川区","parent":"23"}, | |||||
| "4032":{"name":"涪陵区","parent":"23"}, | |||||
| "4033":{"name":"江津区","parent":"23"}, | |||||
| "4034":{"name":"合川区","parent":"23"}, | |||||
| "4035":{"name":"双桥区","parent":"23"}, | |||||
| "4036":{"name":"万盛区","parent":"23"}, | |||||
| "4037":{"name":"荣昌区","parent":"23"}, | |||||
| "4038":{"name":"大足区","parent":"23"}, | |||||
| "4039":{"name":"璧山区","parent":"23"}, | |||||
| "4040":{"name":"铜梁区","parent":"23"}, | |||||
| "4041":{"name":"潼南区","parent":"23"}, | |||||
| "4042":{"name":"綦江区","parent":"23"}, | |||||
| "4043":{"name":"忠县","parent":"23"}, | |||||
| "4044":{"name":"开州区","parent":"23"}, | |||||
| "4045":{"name":"云阳县","parent":"23"}, | |||||
| "4046":{"name":"梁平区","parent":"23"}, | |||||
| "4047":{"name":"垫江县","parent":"23"}, | |||||
| "4048":{"name":"丰都县","parent":"23"}, | |||||
| "4049":{"name":"奉节县","parent":"23"}, | |||||
| "4050":{"name":"巫山县","parent":"23"}, | |||||
| "4051":{"name":"巫溪县","parent":"23"}, | |||||
| "4052":{"name":"城口县","parent":"23"}, | |||||
| "4053":{"name":"武隆区","parent":"23"}, | |||||
| "4054":{"name":"石柱土家族自治县","parent":"23"}, | |||||
| "4055":{"name":"秀山土家族苗族自治县","parent":"23"}, | |||||
| "4056":{"name":"酉阳土家族苗族自治县","parent":"23"}, | |||||
| "4057":{"name":"彭水苗族土家族自治县","parent":"23"}, | |||||
| "4058":{"name":"潜江市","parent":"18"}, | |||||
| "4059":{"name":"三沙市","parent":"22"}, | |||||
| "4060":{"name":"石河子市","parent":"32"}, | |||||
| "4061":{"name":"阿拉尔市","parent":"32"}, | |||||
| "4062":{"name":"图木舒克市","parent":"32"}, | |||||
| "4063":{"name":"五家渠市","parent":"32"}, | |||||
| "4064":{"name":"北屯市","parent":"32"}, | |||||
| "4065":{"name":"铁门关市","parent":"32"}, | |||||
| "4066":{"name":"儋州市","parent":"22"}, | |||||
| "4067":{"name":"五指山市","parent":"22"}, | |||||
| "4068":{"name":"文昌市","parent":"22"}, | |||||
| "4069":{"name":"琼海市","parent":"22"}, | |||||
| "4070":{"name":"万宁市","parent":"22"}, | |||||
| "4072":{"name":"定安县","parent":"22"}, | |||||
| "4073":{"name":"屯昌县","parent":"22"}, | |||||
| "4074":{"name":"澄迈县","parent":"22"}, | |||||
| "4075":{"name":"临高县","parent":"22"}, | |||||
| "4076":{"name":"琼中黎族苗族自治县","parent":"22"}, | |||||
| "4077":{"name":"保亭黎族苗族自治县","parent":"22"}, | |||||
| "4078":{"name":"白沙黎族自治县","parent":"22"}, | |||||
| "4079":{"name":"昌江黎族自治县","parent":"22"}, | |||||
| "4080":{"name":"乐东黎族自治县","parent":"22"}, | |||||
| "4081":{"name":"陵水黎族自治县","parent":"22"}, | |||||
| "4082":{"name":"马来西亚","parent":"3956"}, | |||||
| "6047":{"name":"长寿区","parent":"23"}, | |||||
| "6857":{"name":"阿富汗","parent":"3956"}, | |||||
| "6858":{"name":"阿尔巴尼亚","parent":"3956"}, | |||||
| "6859":{"name":"阿尔及利亚","parent":"3956"}, | |||||
| "6860":{"name":"美属萨摩亚","parent":"3956"}, | |||||
| "6861":{"name":"安道尔","parent":"3956"}, | |||||
| "6862":{"name":"安哥拉","parent":"3956"}, | |||||
| "6863":{"name":"安圭拉","parent":"3956"}, | |||||
| "6864":{"name":"南极洲","parent":"3956"}, | |||||
| "6865":{"name":"安提瓜和巴布达","parent":"3956"}, | |||||
| "6866":{"name":"阿根廷","parent":"3956"}, | |||||
| "6867":{"name":"亚美尼亚","parent":"3956"}, | |||||
| "6869":{"name":"奥地利","parent":"3956"}, | |||||
| "6870":{"name":"阿塞拜疆","parent":"3956"}, | |||||
| "6871":{"name":"巴哈马","parent":"3956"}, | |||||
| "6872":{"name":"巴林","parent":"3956"}, | |||||
| "6873":{"name":"孟加拉国","parent":"3956"}, | |||||
| "6874":{"name":"巴巴多斯","parent":"3956"}, | |||||
| "6875":{"name":"白俄罗斯","parent":"3956"}, | |||||
| "6876":{"name":"比利时","parent":"3956"}, | |||||
| "6877":{"name":"伯利兹","parent":"3956"}, | |||||
| "6878":{"name":"贝宁","parent":"3956"}, | |||||
| "6879":{"name":"百慕大","parent":"3956"}, | |||||
| "6880":{"name":"不丹","parent":"3956"}, | |||||
| "6881":{"name":"玻利维亚","parent":"3956"}, | |||||
| "6882":{"name":"波黑","parent":"3956"}, | |||||
| "6883":{"name":"博茨瓦纳","parent":"3956"}, | |||||
| "6884":{"name":"布维岛","parent":"3956"}, | |||||
| "6885":{"name":"巴西","parent":"3956"}, | |||||
| "6886":{"name":"英属印度洋领土","parent":"3956"}, | |||||
| "6887":{"name":"文莱","parent":"3956"}, | |||||
| "6888":{"name":"保加利亚","parent":"3956"}, | |||||
| "6889":{"name":"布基纳法索","parent":"3956"}, | |||||
| "6890":{"name":"布隆迪","parent":"3956"}, | |||||
| "6891":{"name":"柬埔寨","parent":"3956"}, | |||||
| "6892":{"name":"喀麦隆","parent":"3956"}, | |||||
| "6893":{"name":"佛得角","parent":"3956"}, | |||||
| "6894":{"name":"开曼群岛","parent":"3956"}, | |||||
| "6895":{"name":"中非","parent":"3956"}, | |||||
| "6896":{"name":"乍得","parent":"3956"}, | |||||
| "6897":{"name":"智利","parent":"3956"}, | |||||
| "6898":{"name":"圣诞岛","parent":"3956"}, | |||||
| "6899":{"name":"科科斯(基林)群岛","parent":"3956"}, | |||||
| "6900":{"name":"哥伦比亚","parent":"3956"}, | |||||
| "6901":{"name":"科摩罗","parent":"3956"}, | |||||
| "6902":{"name":"刚果(布)","parent":"3956"}, | |||||
| "6903":{"name":"刚果(金)","parent":"3956"}, | |||||
| "6904":{"name":"库克群岛","parent":"3956"}, | |||||
| "6905":{"name":"哥斯达黎加","parent":"3956"}, | |||||
| "6906":{"name":"科特迪瓦","parent":"3956"}, | |||||
| "6907":{"name":"克罗地亚","parent":"3956"}, | |||||
| "6908":{"name":"古巴","parent":"3956"}, | |||||
| "6909":{"name":"塞浦路斯","parent":"3956"}, | |||||
| "6910":{"name":"捷克","parent":"3956"}, | |||||
| "6911":{"name":"丹麦","parent":"3956"}, | |||||
| "6912":{"name":"吉布提","parent":"3956"}, | |||||
| "6913":{"name":"多米尼克","parent":"3956"}, | |||||
| "6914":{"name":"多米尼加共和国","parent":"3956"}, | |||||
| "6915":{"name":"东帝汶","parent":"3956"}, | |||||
| "6916":{"name":"厄瓜多尔","parent":"3956"}, | |||||
| "6917":{"name":"埃及","parent":"3956"}, | |||||
| "6918":{"name":"萨尔瓦多","parent":"3956"}, | |||||
| "6919":{"name":"赤道几内亚","parent":"3956"}, | |||||
| "6920":{"name":"厄立特里亚","parent":"3956"}, | |||||
| "6921":{"name":"爱沙尼亚","parent":"3956"}, | |||||
| "6922":{"name":"埃塞俄比亚","parent":"3956"}, | |||||
| "6923":{"name":"福克兰群岛(马尔维纳斯)","parent":"3956"}, | |||||
| "6924":{"name":"法罗群岛","parent":"3956"}, | |||||
| "6925":{"name":"斐济","parent":"3956"}, | |||||
| "6926":{"name":"芬兰","parent":"3956"}, | |||||
| "6927":{"name":"法属圭亚那","parent":"3956"}, | |||||
| "6928":{"name":"法属波利尼西亚","parent":"3956"}, | |||||
| "6929":{"name":"法属南部领土","parent":"3956"}, | |||||
| "6930":{"name":"加蓬","parent":"3956"}, | |||||
| "6931":{"name":"冈比亚","parent":"3956"}, | |||||
| "6932":{"name":"格鲁吉亚","parent":"3956"}, | |||||
| "6933":{"name":"加纳","parent":"3956"}, | |||||
| "6934":{"name":"直布罗陀","parent":"3956"}, | |||||
| "6935":{"name":"希腊","parent":"3956"}, | |||||
| "6936":{"name":"格陵兰","parent":"3956"}, | |||||
| "6937":{"name":"格林纳达","parent":"3956"}, | |||||
| "6938":{"name":"瓜德罗普","parent":"3956"}, | |||||
| "6939":{"name":"关岛","parent":"3956"}, | |||||
| "6940":{"name":"危地马拉","parent":"3956"}, | |||||
| "6941":{"name":"几内亚","parent":"3956"}, | |||||
| "6942":{"name":"几内亚比绍","parent":"3956"}, | |||||
| "6943":{"name":"圭亚那","parent":"3956"}, | |||||
| "6944":{"name":"海地","parent":"3956"}, | |||||
| "6945":{"name":"赫德岛和麦克唐纳岛","parent":"3956"}, | |||||
| "6946":{"name":"洪都拉斯","parent":"3956"}, | |||||
| "6947":{"name":"匈牙利","parent":"3956"}, | |||||
| "6948":{"name":"冰岛","parent":"3956"}, | |||||
| "6949":{"name":"印度","parent":"3956"}, | |||||
| "6950":{"name":"印度尼西亚","parent":"3956"}, | |||||
| "6951":{"name":"伊朗","parent":"3956"}, | |||||
| "6952":{"name":"伊拉克","parent":"3956"}, | |||||
| "6953":{"name":"爱尔兰","parent":"3956"}, | |||||
| "6954":{"name":"以色列","parent":"3956"}, | |||||
| "6955":{"name":"牙买加","parent":"3956"}, | |||||
| "6956":{"name":"约旦","parent":"3956"}, | |||||
| "6957":{"name":"哈萨克斯坦","parent":"3956"}, | |||||
| "6958":{"name":"肯尼亚","parent":"3956"}, | |||||
| "6959":{"name":"基里巴斯","parent":"3956"}, | |||||
| "6960":{"name":"朝鲜","parent":"3956"}, | |||||
| "6961":{"name":"科威特","parent":"3956"}, | |||||
| "6962":{"name":"吉尔吉斯斯坦","parent":"3956"}, | |||||
| "6963":{"name":"老挝","parent":"3956"}, | |||||
| "6964":{"name":"拉脱维亚","parent":"3956"}, | |||||
| "6965":{"name":"黎巴嫩","parent":"3956"}, | |||||
| "6966":{"name":"莱索托","parent":"3956"}, | |||||
| "6967":{"name":"利比里亚","parent":"3956"}, | |||||
| "6968":{"name":"利比亚","parent":"3956"}, | |||||
| "6969":{"name":"列支敦士登","parent":"3956"}, | |||||
| "6970":{"name":"立陶宛","parent":"3956"}, | |||||
| "6971":{"name":"卢森堡","parent":"3956"}, | |||||
| "6972":{"name":"前南马其顿","parent":"3956"}, | |||||
| "6973":{"name":"马达加斯加","parent":"3956"}, | |||||
| "6974":{"name":"马拉维","parent":"3956"}, | |||||
| "6975":{"name":"马尔代夫","parent":"3956"}, | |||||
| "6976":{"name":"马里","parent":"3956"}, | |||||
| "6977":{"name":"马耳他","parent":"3956"}, | |||||
| "6978":{"name":"马绍尔群岛","parent":"3956"}, | |||||
| "6979":{"name":"马提尼克","parent":"3956"}, | |||||
| "6980":{"name":"毛里塔尼亚","parent":"3956"}, | |||||
| "6981":{"name":"毛里求斯","parent":"3956"}, | |||||
| "6982":{"name":"马约特","parent":"3956"}, | |||||
| "6983":{"name":"墨西哥","parent":"3956"}, | |||||
| "6984":{"name":"密克罗尼西亚联邦","parent":"3956"}, | |||||
| "6985":{"name":"摩尔多瓦","parent":"3956"}, | |||||
| "6986":{"name":"摩纳哥","parent":"3956"}, | |||||
| "6987":{"name":"蒙古","parent":"3956"}, | |||||
| "6988":{"name":"蒙特塞拉特","parent":"3956"}, | |||||
| "6989":{"name":"摩洛哥","parent":"3956"}, | |||||
| "6990":{"name":"莫桑比克","parent":"3956"}, | |||||
| "6991":{"name":"缅甸","parent":"3956"}, | |||||
| "6992":{"name":"纳米比亚","parent":"3956"}, | |||||
| "6993":{"name":"瑙鲁","parent":"3956"}, | |||||
| "6994":{"name":"尼泊尔","parent":"3956"}, | |||||
| "6995":{"name":"荷兰","parent":"3956"}, | |||||
| "6996":{"name":"荷属安的列斯","parent":"3956"}, | |||||
| "6997":{"name":"新喀里多尼亚","parent":"3956"}, | |||||
| "6998":{"name":"新西兰","parent":"3956"}, | |||||
| "6999":{"name":"尼加拉瓜","parent":"3956"}, | |||||
| "7000":{"name":"尼日尔","parent":"3956"}, | |||||
| "7001":{"name":"尼日利亚","parent":"3956"}, | |||||
| "7002":{"name":"纽埃","parent":"3956"}, | |||||
| "7003":{"name":"诺福克岛","parent":"3956"}, | |||||
| "7004":{"name":"北马里亚纳","parent":"3956"}, | |||||
| "7005":{"name":"挪威","parent":"3956"}, | |||||
| "7006":{"name":"阿曼","parent":"3956"}, | |||||
| "7007":{"name":"巴基斯坦","parent":"3956"}, | |||||
| "7008":{"name":"帕劳","parent":"3956"}, | |||||
| "7009":{"name":"巴勒斯坦","parent":"3956"}, | |||||
| "7010":{"name":"巴拿马","parent":"3956"}, | |||||
| "7011":{"name":"巴布亚新几内亚","parent":"3956"}, | |||||
| "7012":{"name":"巴拉圭","parent":"3956"}, | |||||
| "7013":{"name":"秘鲁","parent":"3956"}, | |||||
| "7014":{"name":"菲律宾","parent":"3956"}, | |||||
| "7015":{"name":"皮特凯恩群岛","parent":"3956"}, | |||||
| "7016":{"name":"波兰","parent":"3956"}, | |||||
| "7017":{"name":"葡萄牙","parent":"3956"}, | |||||
| "7018":{"name":"波多黎各","parent":"3956"}, | |||||
| "7019":{"name":"卡塔尔","parent":"3956"}, | |||||
| "7020":{"name":"留尼汪","parent":"3956"}, | |||||
| "7021":{"name":"罗马尼亚","parent":"3956"}, | |||||
| "7022":{"name":"俄罗斯联邦","parent":"3956"}, | |||||
| "7023":{"name":"卢旺达","parent":"3956"}, | |||||
| "7024":{"name":"圣赫勒拿","parent":"3956"}, | |||||
| "7025":{"name":"圣基茨和尼维斯","parent":"3956"}, | |||||
| "7026":{"name":"圣卢西亚","parent":"3956"}, | |||||
| "7027":{"name":"圣皮埃尔和密克隆","parent":"3956"}, | |||||
| "7028":{"name":"圣文森特和格林纳丁斯","parent":"3956"}, | |||||
| "7029":{"name":"萨摩亚","parent":"3956"}, | |||||
| "7030":{"name":"圣马力诺","parent":"3956"}, | |||||
| "7031":{"name":"圣多美和普林西比","parent":"3956"}, | |||||
| "7032":{"name":"沙特阿拉伯","parent":"3956"}, | |||||
| "7033":{"name":"塞内加尔","parent":"3956"}, | |||||
| "7034":{"name":"塞舌尔","parent":"3956"}, | |||||
| "7035":{"name":"塞拉利昂","parent":"3956"}, | |||||
| "7036":{"name":"新加坡","parent":"3956"}, | |||||
| "7037":{"name":"斯洛伐克","parent":"3956"}, | |||||
| "7038":{"name":"斯洛文尼亚","parent":"3956"}, | |||||
| "7039":{"name":"所罗门群岛","parent":"3956"}, | |||||
| "7040":{"name":"索马里","parent":"3956"}, | |||||
| "7041":{"name":"南非","parent":"3956"}, | |||||
| "7042":{"name":"南乔治亚岛和南桑德韦奇岛","parent":"3956"}, | |||||
| "7043":{"name":"斯里兰卡","parent":"3956"}, | |||||
| "7044":{"name":"苏丹","parent":"3956"}, | |||||
| "7045":{"name":"苏里南","parent":"3956"}, | |||||
| "7046":{"name":"斯瓦尔巴群岛","parent":"3956"}, | |||||
| "7047":{"name":"斯威士兰","parent":"3956"}, | |||||
| "7048":{"name":"瑞典","parent":"3956"}, | |||||
| "7049":{"name":"瑞士","parent":"3956"}, | |||||
| "7050":{"name":"叙利亚","parent":"3956"}, | |||||
| "7051":{"name":"塔吉克斯坦","parent":"3956"}, | |||||
| "7052":{"name":"坦桑尼亚","parent":"3956"}, | |||||
| "7053":{"name":"泰国","parent":"3956"}, | |||||
| "7054":{"name":"多哥","parent":"3956"}, | |||||
| "7055":{"name":"托克劳","parent":"3956"}, | |||||
| "7056":{"name":"汤加","parent":"3956"}, | |||||
| "7057":{"name":"特立尼达和多巴哥","parent":"3956"}, | |||||
| "7058":{"name":"突尼斯","parent":"3956"}, | |||||
| "7059":{"name":"土耳其","parent":"3956"}, | |||||
| "7060":{"name":"土库曼斯坦","parent":"3956"}, | |||||
| "7061":{"name":"特克斯科斯群岛","parent":"3956"}, | |||||
| "7062":{"name":"图瓦卢","parent":"3956"}, | |||||
| "7063":{"name":"乌干达","parent":"3956"}, | |||||
| "7064":{"name":"乌克兰","parent":"3956"}, | |||||
| "7065":{"name":"阿联酋","parent":"3956"}, | |||||
| "7066":{"name":"美国本土外小岛屿","parent":"3956"}, | |||||
| "7067":{"name":"乌拉圭","parent":"3956"}, | |||||
| "7068":{"name":"乌兹别克斯坦","parent":"3956"}, | |||||
| "7069":{"name":"瓦努阿图","parent":"3956"}, | |||||
| "7070":{"name":"梵蒂冈","parent":"3956"}, | |||||
| "7071":{"name":"委内瑞拉","parent":"3956"}, | |||||
| "7072":{"name":"越南","parent":"3956"}, | |||||
| "7073":{"name":"英属维尔京群岛","parent":"3956"}, | |||||
| "7074":{"name":"美属维尔京群岛","parent":"3956"}, | |||||
| "7075":{"name":"瓦利斯和富图纳","parent":"3956"}, | |||||
| "7076":{"name":"西撒哈拉","parent":"3956"}, | |||||
| "7077":{"name":"也门","parent":"3956"}, | |||||
| "7078":{"name":"南斯拉夫","parent":"3956"}, | |||||
| "7079":{"name":"赞比亚","parent":"3956"}, | |||||
| "7080":{"name":"津巴布韦","parent":"3956"}, | |||||
| "7081":{"name":"塞尔维亚","parent":"3956"}, | |||||
| "7082":{"name":"雄安新区","parent":"4"}, | |||||
| "7084":{"name":"天门市","parent":"18"} | |||||
| } | |||||
# Every "name" value found in TBL, collected once so that isName() can do
# constant-time membership tests instead of scanning the table.
NM_SET = {entry["name"] for entry in TBL.values()}
def get_names(id):
    """Return the name for ``id`` followed by the names of its ancestors.

    ``id`` is looked up in ``TBL``; each entry's ``"parent"`` is then
    followed until the chain ends.

    Args:
        id: A table id (``str`` or ``int``), an arbitrary non-numeric
            string, or a missing value (``None``, ``""``, ``0``, ``"none"``).
            The parameter name shadows the builtin, but it is kept because
            callers may pass it by keyword.

    Returns:
        list[str]: ``[]`` for a missing value or an id that is not in
        ``TBL``; ``[str(id)]`` unchanged when ``id`` is not purely numeric;
        otherwise the entry's name followed by each ancestor's name, nearest
        first.
    """
    if not id or str(id).lower() == "none":
        return []
    id = str(id)
    key = id.strip()
    if not re.fullmatch("[0-9]+", key):
        # Not a table id: hand the text back untouched.
        return [id]

    names = []
    visited = set()  # guards against a malformed parent cycle in TBL
    while key not in visited:
        visited.add(key)
        # Look up the *stripped* key so that " 12 " resolves like "12".
        entry = TBL.get(key)
        if not entry:
            break
        names.append(entry["name"])

        parent = entry["parent"]
        if not parent or str(parent).lower() == "none":
            break
        parent = str(parent)
        key = parent.strip()
        if not re.fullmatch("[0-9]+", key):
            # A non-numeric parent is treated as a literal name, exactly as
            # a non-numeric top-level argument is.
            names.append(parent)
            break
    return names
| import re | |||||
def isName(nm):
    """Tell whether ``nm`` matches a name stored in ``NM_SET``.

    Three spellings are tried in order: ``nm`` as given, ``nm`` with "市"
    appended, and ``nm`` with a trailing "省" or "自治区" (optionally
    preceded by "回族", "壮族" or "维吾尔") removed.

    Args:
        nm: Candidate name. Anything that is empty or not a ``str`` is
            rejected rather than raising ``TypeError``.

    Returns:
        bool: ``True`` if any of the spellings is in ``NM_SET``.
    """
    # Missing or non-text values cannot be names; without this guard the
    # concatenation below raises TypeError for None and other non-strings.
    if not nm or not isinstance(nm, str):
        return False
    if nm in NM_SET or nm + "市" in NM_SET:
        return True
    # Drop the administrative suffix and retry with the bare name.
    return re.sub(r"(省|(回族|壮族|维吾尔)*自治区)$", "", nm) in NM_SET
| [ | |||||
| "科技", | |||||
| "集团", | |||||
| "网络科技", | |||||
| "技术", | |||||
| "信息", | |||||
| "分公司", | |||||
| "信息技术", | |||||
| "发展", | |||||
| "科技股份", | |||||
| "网络", | |||||
| "贸易", | |||||
| "商贸", | |||||
| "工程", | |||||
| "企业", | |||||
| "集团股份", | |||||
| "商务", | |||||
| "工业", | |||||
| "控股集团", | |||||
| "国际贸易", | |||||
| "软件技术", | |||||
| "数码科技", | |||||
| "软件开发", | |||||
| "有限", | |||||
| "经营", | |||||
| "科技开发", | |||||
| "股份公司", | |||||
| "电子技术", | |||||
| "实业集团", | |||||
| "责任", | |||||
| "无限", | |||||
| "工程技术", | |||||
| "上市公司", | |||||
| "技术开发", | |||||
| "软件系统", | |||||
| "总公司", | |||||
| "网络服务", | |||||
| "ltd.", | |||||
| "technology", | |||||
| "company", | |||||
| "服务公司", | |||||
| "计算机技术", | |||||
| "计算机软件", | |||||
| "电子信息", | |||||
| "corporation", | |||||
| "计算机服务", | |||||
| "计算机系统", | |||||
| "有限公司", | |||||
| "事业部", | |||||
| "公司", | |||||
| "股份", | |||||
| "有限责任", | |||||
| "软件", | |||||
| "控股", | |||||
| "高科技", | |||||
| "房地产", | |||||
| "事业群", | |||||
| "部门", | |||||
| "电子商务", | |||||
| "人力资源顾问", | |||||
| "人力资源", | |||||
| "株式会社", | |||||
| "网络营销" | |||||
| ] | |||||
| [ | |||||
| "google assistant investments", | |||||
| "amazon", | |||||
| "dingtalk china information", | |||||
| "zhejiang alibaba communication", | |||||
| "yunos", | |||||
| "腾讯云", | |||||
| "新浪新闻", | |||||
| "网邻通", | |||||
| "蚂蚁集团", | |||||
| "大疆", | |||||
| "恒生股份", | |||||
| "sf express", | |||||
| "智者天下", | |||||
| "shanghai hema network", | |||||
| "papayamobile", | |||||
| "lexinfintech", | |||||
| "industrial consumer finance", | |||||
| "360搜索", | |||||
| "世纪光速", | |||||
| "迅雷区块链", | |||||
| "赛盒科技", | |||||
| "齐力电子商务", | |||||
| "平安养老险", | |||||
| "平安证券", | |||||
| "平安好贷", | |||||
| "五八新服", | |||||
| "呯嘭智能", | |||||
| "阿里妈妈", | |||||
| "mdt", | |||||
| "tencent", | |||||
| "weibo", | |||||
| "浪潮软件", | |||||
| "阿里巴巴广告", | |||||
| "mashang consumer finance", | |||||
| "维沃", | |||||
| "hqg , limited", | |||||
| "moodys", | |||||
| "搜狐支付", | |||||
| "百度秀", | |||||
| "新浪服务", | |||||
| "零售通", | |||||
| "同城艺龙", | |||||
| "虾米音乐", | |||||
| "贝壳集团", | |||||
| "小米有品", | |||||
| "滴滴自动驾驶", | |||||
| "图记", | |||||
| "阿里影业", | |||||
| "卓联软件", | |||||
| "zhejiang tmall", | |||||
| "谷歌中国", | |||||
| "hithink flush", | |||||
| "时装科技", | |||||
| "程会玩国际旅行社", | |||||
| "amazon china holding limited", | |||||
| "中信消金", | |||||
| "当当比特物流", | |||||
| "新浪新媒体咨询", | |||||
| "tongcheng network", | |||||
| "金山在线", | |||||
| "shopping cart", | |||||
| "犀互动", | |||||
| "五八", | |||||
| "bilibili", | |||||
| "阿里星球", | |||||
| "滴滴金科服务", | |||||
| "美团", | |||||
| "哈啰出行", | |||||
| "face", | |||||
| "平安健康", | |||||
| "招商银行", | |||||
| "连亚", | |||||
| "盒马网络", | |||||
| "b站", | |||||
| "华为机器", | |||||
| "shanghai mdt infotech", | |||||
| "ping an healthkonnect", | |||||
| "beijing home link real estate broker", | |||||
| "花海仓", | |||||
| "beijing jingdong shangke information", | |||||
| "微影智能", | |||||
| "酷狗游戏", | |||||
| "health.pingan.com", | |||||
| "众安", | |||||
| "陌陌", | |||||
| "海康威视数字", | |||||
| "同程网", | |||||
| "艾丁金融", | |||||
| "知乎", | |||||
| " lu", | |||||
| "国际商业机器公司", | |||||
| "捷信消费金融", | |||||
| "恒生利融", | |||||
| "china merchants bank", | |||||
| "企鹅电竞", | |||||
| "捷信信驰", | |||||
| "360智能家居", | |||||
| "小桔车服", | |||||
| "homecredit", | |||||
| "皮皮虾", | |||||
| "畅游", | |||||
| "聚爱聊", | |||||
| "suning.com", | |||||
| "途牛旅游网", | |||||
| "花呗", | |||||
| "盈店通", | |||||
| "sina", | |||||
| "阿里巴巴音乐", | |||||
| "华为技术有限公司", | |||||
| "国付宝", | |||||
| "shanghai lianshang network", | |||||
| "oppo", | |||||
| "华为投资控股", | |||||
| "beijing sohu new media information", | |||||
| "times square", | |||||
| "菜鸟物流", | |||||
| "lingxing", | |||||
| "jd digits", | |||||
| "同程旅游", | |||||
| "分期乐", | |||||
| "火锅视频", | |||||
| "天天快报", | |||||
| "猎豹移动", | |||||
| "五八人力资源", | |||||
| "宝宝树", | |||||
| "顺丰科技", | |||||
| "上海西翠", | |||||
| "诗程文化传播", | |||||
| "dewu", | |||||
| "领星网络", | |||||
| "aliexpress", | |||||
| "贝塔通科技", | |||||
| "链家", | |||||
| "花小猪", | |||||
| "趣输入", | |||||
| "搜狐新媒体", | |||||
| "一淘", | |||||
| "56", | |||||
| "qq阅读", | |||||
| "青桔单车", | |||||
| "iflytek", | |||||
| "每日优鲜电子商务", | |||||
| "腾讯觅影", | |||||
| "微医", | |||||
| "松果网", | |||||
| "paypal", | |||||
| "递瑞供应链管理", | |||||
| "领星", | |||||
| "qunar", | |||||
| "三快", | |||||
| "lu.com", | |||||
| "携程旅行网", | |||||
| "新潮传媒", | |||||
| "链家经纪", | |||||
| "景域文化", | |||||
| "阿里健康", | |||||
| "pingpeng", | |||||
| "聚划算", | |||||
| "零机科技", | |||||
| "街兔电单车", | |||||
| "快乐购", | |||||
| "华为数字能源", | |||||
| "搜狐", | |||||
| "陆家嘴国际金融资产交易市场", | |||||
| "nanjing tuniu", | |||||
| "亚马逊", | |||||
| "苏宁易购", | |||||
| "携程旅游", | |||||
| "苏宁金服", | |||||
| "babytree", | |||||
| "悟空问答", | |||||
| "同花顺", | |||||
| "eastmoney", | |||||
| "浪潮信息", | |||||
| "滴滴智慧交通", | |||||
| "beijing ruixun lingtong", | |||||
| "平安综合金融服务", | |||||
| "爱奇艺", | |||||
| "小米集团", | |||||
| "华为云", | |||||
| "微店", | |||||
| "恒生集团", | |||||
| "网易有道", | |||||
| "boccfc", | |||||
| "世纪思速科技", | |||||
| "海康消防", | |||||
| "beijing xiaomi", | |||||
| "众安科技", | |||||
| "五八同城", | |||||
| "霆程汽车租赁", | |||||
| "云卖分销", | |||||
| "乐信集团", | |||||
| "蚂蚁", | |||||
| "舶乐蜜电子商务", | |||||
| "支付宝中国", | |||||
| "砖块消消消", | |||||
| "vivo", | |||||
| "阿里互娱", | |||||
| "中国平安", | |||||
| "lingxihudong", | |||||
| "百度网盘", | |||||
| "1号店", | |||||
| "字节跳动", | |||||
| "京东科技", | |||||
| "驴妈妈兴旅国际旅行社", | |||||
| "hangzhou alibaba music", | |||||
| "xunlei", | |||||
| "灵犀互动娱乐", | |||||
| "快手", | |||||
| "youtube", | |||||
| "连尚慧眼", | |||||
| "腾讯体育", | |||||
| "爱商在线", | |||||
| "酷我音乐", | |||||
| "金融壹账通", | |||||
| "搜狗服务", | |||||
| "banma information", | |||||
| "a站", | |||||
| "罗汉堂", | |||||
| "薇仕网络", | |||||
| "搜狐新闻", | |||||
| "贝宝", | |||||
| "薇仕", | |||||
| "口袋时尚科技", | |||||
| "穆迪咨询", | |||||
| "新狐投资管理", | |||||
| "hikvision", | |||||
| "alimama china holding limited", | |||||
| "超聚变数字", | |||||
| "腾讯视频", | |||||
| "恒生电子", | |||||
| "百度游戏", | |||||
| "绿洲", | |||||
| "木瓜移动", | |||||
| "红袖添香", | |||||
| "店匠科技", | |||||
| "易贝", | |||||
| "一淘网", | |||||
| "博览群书", | |||||
| "唯品会", | |||||
| "lazglobal", | |||||
| "amap", | |||||
| "芒果网", | |||||
| "口碑", | |||||
| "海康慧影", | |||||
| "腾讯音乐娱乐", | |||||
| "网易严选", | |||||
| "微信", | |||||
| "shenzhen lexin holding", | |||||
| "hangzhou pingpeng intelligent", | |||||
| "连尚网络", | |||||
| "海思", | |||||
| "isunor", | |||||
| "蝉翼", | |||||
| "阿里游戏", | |||||
| "广州优视", | |||||
| "优视", | |||||
| "腾讯征信", | |||||
| "识装", | |||||
| "finserve.pingan.com", | |||||
| "papaya", | |||||
| "阅文", | |||||
| "平安健康保险", | |||||
| "考拉海购", | |||||
| "网易印象", | |||||
| "wifi万能钥匙", | |||||
| "新浪互联服务", | |||||
| "亚马逊云科技", | |||||
| "迅雷看看", | |||||
| "华为朗新科技", | |||||
| "adyen hong kong limited", | |||||
| "谷歌", | |||||
| "得物", | |||||
| "网心", | |||||
| "cainiao network", | |||||
| "沐瞳", | |||||
| "linkedln", | |||||
| "hundsun", | |||||
| "阿里旅行", | |||||
| "珍爱网", | |||||
| "阿里巴巴通信", | |||||
| "金山奇剑", | |||||
| "tongtool", | |||||
| "华为安捷信电气", | |||||
| "快乐时代", | |||||
| "平安寿险", | |||||
| "微博", | |||||
| "微跳蚤", | |||||
| "oppo移动通信", | |||||
| "毒", | |||||
| "alimama", | |||||
| "shoplazza", | |||||
| "shenzhen dianjiang science and", | |||||
| "众鸣世科", | |||||
| "平安金融", | |||||
| "狐友", | |||||
| "维沃移动通信", | |||||
| "tobosoft", | |||||
| "齐力电商", | |||||
| "ali", | |||||
| "诚信通", | |||||
| "行吟", | |||||
| "跳舞的线", | |||||
| "橙心优选", | |||||
| "众安健康", | |||||
| "亚马逊中国投资", | |||||
| "德絮投资管理中心合伙", | |||||
| "招联消费金融", | |||||
| "百度文学", | |||||
| "芝麻信用", | |||||
| "阿里零售通", | |||||
| "时装", | |||||
| "花样直播", | |||||
| "sogou", | |||||
| "uc", | |||||
| "海思半导体", | |||||
| "zhongan online p&c insurance", | |||||
| "新浪数字", | |||||
| "驴妈妈旅游网", | |||||
| "华为数字能源技术", | |||||
| "京东数科", | |||||
| "oracle", | |||||
| "xiaomi", | |||||
| "nyse", | |||||
| "阳光消费金融", | |||||
| "天天动听", | |||||
| "大众点评", | |||||
| "上海瑞家", | |||||
| "trustpass", | |||||
| "hundsun technologies", | |||||
| "美团小贷", | |||||
| "ebay", | |||||
| "通途", | |||||
| "tcl", | |||||
| "鸿蒙", | |||||
| "酷狗计算机", | |||||
| "品诺保险", | |||||
| "capitalg", | |||||
| "康盛创想", | |||||
| "58同城", | |||||
| "闲鱼", | |||||
| "微软", | |||||
| "吉易付科技", | |||||
| "理财通", | |||||
| "ctrip", | |||||
| "yy", | |||||
| "华为数字", | |||||
| "kingsoft", | |||||
| "孙宁金融", | |||||
| "房江湖经纪", | |||||
| "youku", | |||||
| "ant financial services group", | |||||
| "盒马", | |||||
| "sensetime", | |||||
| "伊千网络", | |||||
| "小豹ai翻译棒", | |||||
| "shopify", | |||||
| "前海微众银行", | |||||
| "qd", | |||||
| "gmail", | |||||
| "pingpong", | |||||
| "alibaba group holding limited", | |||||
| "捷信时空电子商务", | |||||
| "orientsec", | |||||
| "乔戈里管理咨询", | |||||
| "ant", | |||||
| "锐讯灵通", | |||||
| "兴业消费金融", | |||||
| "京东叁佰陆拾度电子商务", | |||||
| "新浪", | |||||
| "优酷土豆", | |||||
| "海康机器人", | |||||
| "美团单车", | |||||
| "海康存储", | |||||
| "领英", | |||||
| "阿里全球速卖通", | |||||
| "美菜网", | |||||
| "京邦达", | |||||
| "安居客", | |||||
| "阿里体育", | |||||
| "相互宝", | |||||
| "cloudwalk", | |||||
| "百度智能云", | |||||
| "贝壳", | |||||
| "酷狗", | |||||
| "sunshine consumer finance", | |||||
| "掌宜", | |||||
| "奇酷网", | |||||
| "核新同花顺", | |||||
| "阿里巴巴影业", | |||||
| "节创", | |||||
| "学而思网校", | |||||
| "速途", | |||||
| "途牛", | |||||
| "阿里云计算", | |||||
| "beijing sensetime", | |||||
| "alibaba cloud", | |||||
| "西瓜视频", | |||||
| "美团优选", | |||||
| "orient securities limited", | |||||
| "华为朗新", | |||||
| "店匠", | |||||
| "shanghai weishi network", | |||||
| "友盟", | |||||
| "飞猪旅行", | |||||
| "滴滴出行", | |||||
| "alipay", | |||||
| "mogu", | |||||
| "dangdang", | |||||
| "大麦网", | |||||
| "汉军智能系统", | |||||
| "百度地图", | |||||
| "货车帮", | |||||
| "狐狸金服", | |||||
| "众安在线保险经纪", | |||||
| "华为通信", | |||||
| "新浪支付", | |||||
| "zhihu", | |||||
| "alibaba cloud computing", | |||||
| "沙发视频", | |||||
| "金山软件", | |||||
| "ping an good doctor", | |||||
| "携程", | |||||
| "脉脉", | |||||
| "youku information beijing", | |||||
| "zhongan", | |||||
| "艾丁软件", | |||||
| "乒乓智能", | |||||
| "蘑菇街", | |||||
| "taobao", | |||||
| "华为技术服务", | |||||
| "仕承文化传播", | |||||
| "安捷信", | |||||
| "狐狸互联网小额贷款", | |||||
| "节点迅捷", | |||||
| "中国银行", | |||||
| "搜镇", | |||||
| "众安在线", | |||||
| "dingtalk", | |||||
| "云从科技", | |||||
| "beijing jingbangda trade", | |||||
| "moody s", | |||||
| "滚动的天空", | |||||
| "yl.pingan.com", | |||||
| "奇虎", | |||||
| "alihealth", | |||||
| "芒果tv", | |||||
| "lufax", | |||||
| "美团打车", | |||||
| "小桔", | |||||
| "贝壳找房网", | |||||
| "小米科技", | |||||
| "vips", | |||||
| "kindle", | |||||
| "亚马逊服务", | |||||
| "citic consumer finance", | |||||
| "微众", | |||||
| "搜狗智慧互联网医院", | |||||
| "盒马鲜生", | |||||
| "life.pinan.com", | |||||
| "ph.com.cn", | |||||
| "银联", | |||||
| "cmbchina", | |||||
| "平安金融科技咨询", | |||||
| "微保", | |||||
| "甲骨文中国", | |||||
| "飞书", | |||||
| "koubei shanghai information", | |||||
| "企鹅辅导", | |||||
| "斑马", | |||||
| "平安租赁", | |||||
| "云从", | |||||
| "马上消费", | |||||
| "hangzhou ali baba advertising", | |||||
| "金山", | |||||
| "赛盒", | |||||
| "科大讯飞", | |||||
| "金星创业投资", | |||||
| "平安国际融资租赁", | |||||
| "360你财富", | |||||
| "西山居", | |||||
| "shenzhen qianhai fourth paradigm data", | |||||
| "海思光电子", | |||||
| "猎户星空", | |||||
| "网易公司", | |||||
| "浪潮", | |||||
| "粒粒橙传媒", | |||||
| "招联金融", | |||||
| "100. me", | |||||
| "捷信信驰咨询", | |||||
| "唯品仓", | |||||
| "orient", | |||||
| "趣拿", | |||||
| "摩拜单车", | |||||
| "天猫精灵", | |||||
| "菜鸟", | |||||
| "豹小贩", | |||||
| "去哪儿", | |||||
| "米家", | |||||
| "哈啰单车", | |||||
| "搜狐体育", | |||||
| "shopify payments usa", | |||||
| "高德软件", | |||||
| "讯联智付", | |||||
| "乐信", | |||||
| "唯你搭", | |||||
| "第四范式", | |||||
| "菜鸟网络", | |||||
| "同程", | |||||
| "yy语音", | |||||
| "浪潮云", | |||||
| "东财", | |||||
| "淘宝", | |||||
| "寻梦", | |||||
| "citic securities limited", | |||||
| "青橙之旅", | |||||
| "阿里巴巴", | |||||
| "番茄小说", | |||||
| "上海亿贝", | |||||
| "inspur", | |||||
| "babytree inc", | |||||
| "海康智慧产业股权投资基金合伙合伙", | |||||
| "adyen", | |||||
| "艺龙", | |||||
| "蚂蚁金服", | |||||
| "平安金服", | |||||
| "百度百科", | |||||
| "unionpay", | |||||
| "当当", | |||||
| "阅文集团", | |||||
| "东方财富", | |||||
| "东方证券", | |||||
| "哈罗单车", | |||||
| "优酷", | |||||
| "海康", | |||||
| "alipay china network", | |||||
| "网商银行", | |||||
| "钧正", | |||||
| "property.pingan.com", | |||||
| "豹咖啡", | |||||
| "网易", | |||||
| "我爱cba", | |||||
| "theduapp", | |||||
| "360", | |||||
| "金山数字娱乐", | |||||
| "新浪阅读", | |||||
| "alibabagames", | |||||
| "顺丰", | |||||
| "支点商贸", | |||||
| "同程旅行", | |||||
| "citic securities", | |||||
| "ele.com", | |||||
| "tal", | |||||
| "fresh hema", | |||||
| "运满满", | |||||
| "贝壳网", | |||||
| "酷狗音乐", | |||||
| "鲜城", | |||||
| "360健康", | |||||
| "浪潮世科", | |||||
| "迅雷网络", | |||||
| "哔哩哔哩", | |||||
| "华为电动", | |||||
| "淘友天下", | |||||
| "华多网络", | |||||
| "xunlei networking technologies", | |||||
| "云杉", | |||||
| "当当网电子商务", | |||||
| "津虹网络", | |||||
| "wedoc cloud hangzhou holdings", | |||||
| "alisports shanghai", | |||||
| "旷视金智", | |||||
| "钉钉中国", | |||||
| "微影", | |||||
| "金山快快", | |||||
| "亿贝", | |||||
| "wedoc", | |||||
| "autonavi", | |||||
| "哈啰助力车", | |||||
| "google cloud", | |||||
| "新浪乐居", | |||||
| "京东股票", | |||||
| "搜狗智慧远程医疗中心", | |||||
| "中银消金", | |||||
| "merchants union consumer finance", | |||||
| "王者荣耀", | |||||
| "百度手机", | |||||
| "美团民宿", | |||||
| "kaola", | |||||
| "小屋", | |||||
| "金山网络", | |||||
| "来往", | |||||
| "顺丰速运", | |||||
| "腾讯课堂", | |||||
| "百度在线网络", | |||||
| "美团买菜", | |||||
| "威视汽车", | |||||
| "uc mobile", | |||||
| "来赞达", | |||||
| "平安健康医疗", | |||||
| "豹小秘", | |||||
| "尚网", | |||||
| "哈勃投资", | |||||
| " ping an insurance group of china ,", | |||||
| "小米", | |||||
| "360好药", | |||||
| "qq音乐", | |||||
| "lingxigames", | |||||
| "faceu激萌", | |||||
| "搜狗", | |||||
| "sohu", | |||||
| "满帮", | |||||
| "vipshop", | |||||
| "wishpost", | |||||
| "金山世游", | |||||
| "shanghai yibaimi network", | |||||
| "1688", | |||||
| "海康汽车", | |||||
| "顺丰控股", | |||||
| "华为", | |||||
| "妙镜vr", | |||||
| "paybkj.com", | |||||
| "hellobike", | |||||
| "豹来电", | |||||
| "京东", | |||||
| "驴妈妈", | |||||
| "momo", | |||||
| "平安健康险", | |||||
| "哈勃科技", | |||||
| "美菜", | |||||
| "众安在线财产保险", | |||||
| "海康威视", | |||||
| "east money information", | |||||
| "阿里云", | |||||
| "蝉游记", | |||||
| "余额宝", | |||||
| "屋客", | |||||
| "滴滴", | |||||
| "shopify international limited", | |||||
| "百度", | |||||
| "阿里健康中国", | |||||
| "阿里通信", | |||||
| "微梦创科", | |||||
| "微医云", | |||||
| "轻颜相机", | |||||
| "搜易居", | |||||
| "趣店集团", | |||||
| "美团云", | |||||
| "ant group", | |||||
| "金山云", | |||||
| "beijing express hand", | |||||
| "觅觅", | |||||
| "支付宝", | |||||
| "滴滴承信科技咨询服务", | |||||
| "拼多多", | |||||
| "众安运动", | |||||
| "乞力电商", | |||||
| "youcash", | |||||
| "唯品金融", | |||||
| "陆金所", | |||||
| "本地生活", | |||||
| "sz dji", | |||||
| "海康智能", | |||||
| "魔方网聘", | |||||
| "青藤大学", | |||||
| "international business machines", | |||||
| "学而思", | |||||
| "beijing zhongming century science and", | |||||
| "猎豹清理大师", | |||||
| "asinking", | |||||
| "高德", | |||||
| "苏宁", | |||||
| "优酷网", | |||||
| "艾丁", | |||||
| "中银消费金融", | |||||
| "京东健康", | |||||
| "五八教育", | |||||
| "pingpongx", | |||||
| "搜狐时尚", | |||||
| "阿里广告", | |||||
| "平安财险", | |||||
| "中邮消金", | |||||
| "etao", | |||||
| "怕怕", | |||||
| "nyse:cmcm", | |||||
| "华为培训中心", | |||||
| "高德地图", | |||||
| "云狐天下征信", | |||||
| "大疆创新", | |||||
| "连尚", | |||||
| "壹佰米", | |||||
| "康健公司", | |||||
| "iqiyi.com", | |||||
| "360安全云盘", | |||||
| "馒头直播", | |||||
| "淘友网", | |||||
| "东方赢家", | |||||
| "bank of china", | |||||
| "微众银行", | |||||
| "阿里巴巴国际站", | |||||
| "虾米", | |||||
| "去哪儿网", | |||||
| "ctrip travel network shanghai", | |||||
| "潇湘书院", | |||||
| "腾讯", | |||||
| "快乐阳光互动娱乐传媒", | |||||
| "迅雷", | |||||
| "weidian", | |||||
| "滴滴货运", | |||||
| "ping an puhui enterprise management", | |||||
| "新浪仓石基金销售", | |||||
| "搜狐焦点", | |||||
| "alibaba pictures", | |||||
| "wps", | |||||
| "平安", | |||||
| "lazmall", | |||||
| "百度开放平台", | |||||
| "兴业消金", | |||||
| " 珍爱网", | |||||
| "京东云", | |||||
| "小红书", | |||||
| "1688. com", | |||||
| "如视智数", | |||||
| "missfresh", | |||||
| "pazl.pingan.cn", | |||||
| "平安集团", | |||||
| "kugou", | |||||
| "懂车帝", | |||||
| "斑马智行", | |||||
| "浪潮集团", | |||||
| "netease hangzhou network", | |||||
| "pagd.net", | |||||
| "探探", | |||||
| "chinaliterature", | |||||
| "amazon亚马逊", | |||||
| "alphabet", | |||||
| "当当文创手工艺品电子商务", | |||||
| "五八邦", | |||||
| "shenzhen zhenai network information", | |||||
| "lingshoutong", | |||||
| "字节", | |||||
| "lvmama", | |||||
| "金山办公", | |||||
| "众安保险", | |||||
| "时装信息", | |||||
| "优视科技", | |||||
| "guangzhou kugou", | |||||
| "ibm", | |||||
| "滴滴打车", | |||||
| "beijing sogou information service", | |||||
| "megvii", | |||||
| "健谈哥", | |||||
| "cloudwalk group", | |||||
| "蜂联科技", | |||||
| "冬云", | |||||
| "京东尚科", | |||||
| "钢琴块2", | |||||
| "京东世纪", | |||||
| "商汤", | |||||
| "众鸣世纪", | |||||
| "腾讯音乐", | |||||
| "迅雷网文化", | |||||
| "华为云计算技术", | |||||
| "live.me", | |||||
| "全球速卖通", | |||||
| "快的打车", | |||||
| "hello group inc", | |||||
| "美丽说", | |||||
| "suning", | |||||
| "opengauss", | |||||
| "lazada", | |||||
| "tmall", | |||||
| "acfun", | |||||
| "当当网", | |||||
| "中银", | |||||
| "旷视科技", | |||||
| "百度钱包", | |||||
| "淘宝网", | |||||
| "新浪微博", | |||||
| "迅雷集团", | |||||
| "中信消费金融", | |||||
| "学而思教育", | |||||
| "平安普惠", | |||||
| "悟空跨境", | |||||
| "irobotbox", | |||||
| "平安产险", | |||||
| "inspur group", | |||||
| "世纪卓越快递服务", | |||||
| "奇虎360", | |||||
| "webank", | |||||
| "偶藻", | |||||
| "唯品支付", | |||||
| "腾讯云计算", | |||||
| "众安服务", | |||||
| "亿之唐", | |||||
| "beijing 58 information ttechnology", | |||||
| "平安好医生", | |||||
| "迅雷之锤", | |||||
| "旅行小账本", | |||||
| "芒果游戏", | |||||
| "新浪传媒", | |||||
| "旷镜博煊", | |||||
| "全民k歌", | |||||
| "滴滴支付", | |||||
| "北京网心科技", | |||||
| "挂号网", | |||||
| "萤石", | |||||
| "chinavision media group limited", | |||||
| "猎豹安全大师", | |||||
| "cmcm", | |||||
| "趣店", | |||||
| "蚂蚁财富", | |||||
| "商汤科技", | |||||
| "甲骨文", | |||||
| "百度云", | |||||
| "百度apollo", | |||||
| "19 pay", | |||||
| "stock.pingan.com", | |||||
| "tiktok", | |||||
| "alibaba pictures group limited", | |||||
| "ele", | |||||
| "考拉", | |||||
| "天猫", | |||||
| "腾讯优图", | |||||
| "起点中文网", | |||||
| "百度视频", | |||||
| "shanghai bili bili", | |||||
| "京东物流", | |||||
| "ebay marketplaces gmbh", | |||||
| "alibaba sport", | |||||
| "wish", | |||||
| "阿里巴巴中国", | |||||
| "中国银联", | |||||
| "alibaba china network", | |||||
| "china ping an property insurance", | |||||
| "百度糯米网", | |||||
| "微软中国", | |||||
| "一九付", | |||||
| "4 paradigm", | |||||
| "叮咚买菜", | |||||
| "umeng", | |||||
| "众鸣科技", | |||||
| "平安财富通", | |||||
| "google", | |||||
| "巨量引擎", | |||||
| "百度贴吧", | |||||
| "beijing jingdong century information", | |||||
| "讯飞", | |||||
| "beijing yunshan information", | |||||
| "满运软件", | |||||
| "中邮消费金融", | |||||
| "饿了么", | |||||
| "alios", | |||||
| "腾讯ai实验室", | |||||
| "第四范式智能", | |||||
| "瀚星创业投资", | |||||
| "gradient ventures", | |||||
| "microsoft", | |||||
| "哈啰共享汽车", | |||||
| "乞力电子商务", | |||||
| "mscf", | |||||
| "网易影业文化", | |||||
| "铁友旅游咨询", | |||||
| "kilimall", | |||||
| "云企互联投资", | |||||
| "ping an financial consulting", | |||||
| "beijng jingdong century commerce", | |||||
| "高德威智能交通系统", | |||||
| "中友信息", | |||||
| "平安医疗健康管理", | |||||
| "eciticcfc", | |||||
| "中信证券", | |||||
| "fliggy", | |||||
| "电子湾", | |||||
| "旷云金智", | |||||
| "微粒贷", | |||||
| "rsi", | |||||
| "滴滴云计算", | |||||
| "google ventures", | |||||
| "箐程", | |||||
| "每日优鲜", | |||||
| "音兔", | |||||
| "拉扎斯", | |||||
| "今日头条", | |||||
| "乐信控股", | |||||
| "猎豹浏览器", | |||||
| "细微咨询", | |||||
| "好未来", | |||||
| "我乐", | |||||
| "绘声绘色", | |||||
| "抖音", | |||||
| "搜狐新时代", | |||||
| "飞猪", | |||||
| "鹅厂", | |||||
| "贝壳找房", | |||||
| "tuniu", | |||||
| "红马传媒文化", | |||||
| "钉钉", | |||||
| "马上消费金融", | |||||
| "360手机", | |||||
| "平安医保", | |||||
| "快途", | |||||
| "alibaba", | |||||
| "小哈换电", | |||||
| "大麦", | |||||
| "恒睿人工智能研究院", | |||||
| "谷歌资本", | |||||
| "猎豹", | |||||
| "穆迪信息" | |||||
| ] |
| [ | |||||
| "中国科技大学", | |||||
| "国防科学技术大学", | |||||
| "清华大学", | |||||
| "清华", | |||||
| "tsinghua university", | |||||
| "thu", | |||||
| "北京大学", | |||||
| "北大", | |||||
| "beijing university", | |||||
| "pku", | |||||
| "中国科学技术大学", | |||||
| "中国科大", | |||||
| "中科大", | |||||
| "china science & technology university", | |||||
| "ustc", | |||||
| "复旦大学", | |||||
| "复旦", | |||||
| "fudan university", | |||||
| "fdu", | |||||
| "中国人民大学", | |||||
| "人大", | |||||
| "人民大学", | |||||
| "renmin university of china", | |||||
| "ruc", | |||||
| "上海交通大学", | |||||
| "上海交大", | |||||
| "shanghai jiao tong university", | |||||
| "sjtu", | |||||
| "南京大学", | |||||
| "南大", | |||||
| "nanjing university", | |||||
| "nju", | |||||
| "同济大学", | |||||
| "同济", | |||||
| "tongji university", | |||||
| "tongji", | |||||
| "浙江大学", | |||||
| "浙大", | |||||
| "zhejiang university", | |||||
| "zju", | |||||
| "南开大学", | |||||
| "南开", | |||||
| "nankai university", | |||||
| "nku", | |||||
| "北京航空航天大学", | |||||
| "北航", | |||||
| "beihang university", | |||||
| "buaa", | |||||
| "北京师范大学", | |||||
| "北师", | |||||
| "北师大", | |||||
| "beijing normal university", | |||||
| "bnu", | |||||
| "武汉大学", | |||||
| "武大", | |||||
| "wuhan university", | |||||
| "whu", | |||||
| "西安交通大学", | |||||
| "西安交大", | |||||
| "xi’an jiaotong university", | |||||
| "xjtu", | |||||
| "天津大学", | |||||
| "天大", | |||||
| "university of tianjin", | |||||
| "tju", | |||||
| "华中科技大学", | |||||
| "华中大", | |||||
| "central china university science and technology", | |||||
| "hust", | |||||
| "北京理工大学", | |||||
| "北理", | |||||
| "beijing institute of technology", | |||||
| "bit", | |||||
| "东南大学", | |||||
| "东大", | |||||
| "southeast china university", | |||||
| "seu", | |||||
| "中山大学", | |||||
| "中大", | |||||
| "zhongshan university", | |||||
| "sysu", | |||||
| "华东师范大学", | |||||
| "华师大", | |||||
| "east china normal university", | |||||
| "ecnu", | |||||
| "哈尔滨工业大学", | |||||
| "哈工大", | |||||
| "harbin institute of technology", | |||||
| "hit", | |||||
| "厦门大学", | |||||
| "厦大", | |||||
| "xiamen university", | |||||
| "xmu", | |||||
| "西北工业大学", | |||||
| "西工大", | |||||
| "西北工大", | |||||
| "northwestern polytechnical university", | |||||
| "npu", | |||||
| "中南大学", | |||||
| "中南", | |||||
| "middle and southern university", | |||||
| "csu", | |||||
| "大连理工大学", | |||||
| "大工", | |||||
| "institute of technology of dalian", | |||||
| "dut", | |||||
| "四川大学", | |||||
| "川大", | |||||
| "sichuan university", | |||||
| "scu", | |||||
| "电子科技大学", | |||||
| "电子科大", | |||||
| "university of electronic science and technology of china", | |||||
| "uestc", | |||||
| "华南理工大学", | |||||
| "华南理工", | |||||
| "institutes of technology of south china", | |||||
| "scut", | |||||
| "吉林大学", | |||||
| "吉大", | |||||
| "jilin university", | |||||
| "jlu", | |||||
| "湖南大学", | |||||
| "湖大", | |||||
| "hunan university", | |||||
| "hnu", | |||||
| "重庆大学", | |||||
| "重大", | |||||
| "university of chongqing", | |||||
| "cqu", | |||||
| "山东大学", | |||||
| "山大", | |||||
| "shandong university", | |||||
| "sdu", | |||||
| "中国农业大学", | |||||
| "中国农大", | |||||
| "china agricultural university", | |||||
| "cau", | |||||
| "中国海洋大学", | |||||
| "中国海大", | |||||
| "chinese marine university", | |||||
| "ouc", | |||||
| "中央民族大学", | |||||
| "中央民大", | |||||
| "central university for nationalities", | |||||
| "muc", | |||||
| "东北大学", | |||||
| "东北工学院", | |||||
| "northeastern university", | |||||
| "neu 或 nu", | |||||
| "兰州大学", | |||||
| "兰大", | |||||
| "lanzhou university", | |||||
| "lzu", | |||||
| "西北农林科技大学", | |||||
| "西农","西北农大", | |||||
| "northwest a&f university", | |||||
| "nwafu", | |||||
| "中国人民解放军国防科技大学", | |||||
| "国防科技大学","国防科大", | |||||
| "national university of defense technology", | |||||
| "nudt", | |||||
| "郑州大学", | |||||
| "郑大", | |||||
| "zhengzhou university", | |||||
| "zzu", | |||||
| "云南大学", | |||||
| "云大", | |||||
| "yunnan university", | |||||
| "ynu", | |||||
| "新疆大学", | |||||
| "新大", | |||||
| "xinjiang university", | |||||
| "xju", | |||||
| "北京交通大学", | |||||
| "北京交大", | |||||
| "beijing jiaotong university", | |||||
| "bjtu", | |||||
| "北京工业大学", | |||||
| "北工大", | |||||
| "beijing university of technology", | |||||
| "bjut", | |||||
| "北京科技大学", | |||||
| "北科大","北京科大", | |||||
| "university of science and technology beijing", | |||||
| "ustb", | |||||
| "北京化工大学", | |||||
| "北化", | |||||
| "beijing university of chemical technology", | |||||
| "buct", | |||||
| "北京邮电大学", | |||||
| "北邮", | |||||
| "beijing university of posts and telecommunications", | |||||
| "beijing university of post and telecommunications", | |||||
| "beijing university of post and telecommunication", | |||||
| "beijing university of posts and telecommunication", | |||||
| "bupt", | |||||
| "北京林业大学", | |||||
| "北林", | |||||
| "beijing forestry university", | |||||
| "bfu", | |||||
| "北京协和医学院", | |||||
| "协和医学院", | |||||
| "peking union medical college", | |||||
| "pumc", | |||||
| "北京中医药大学", | |||||
| "北中医", | |||||
| "beijing university of chinese medicine", | |||||
| "bucm", | |||||
| "首都师范大学", | |||||
| "首师大", | |||||
| "capital normal university", | |||||
| "cnu", | |||||
| "北京外国语大学", | |||||
| "北外", | |||||
| "beijing foreign studies university", | |||||
| "bfsu", | |||||
| "中国传媒大学", | |||||
| "中媒", | |||||
| "中传", | |||||
| "北京广播学院", | |||||
| "communication university of china", | |||||
| "cuc", | |||||
| "中央财经大学", | |||||
| "中央财大", | |||||
| "中财大", | |||||
| "the central university of finance and economics", | |||||
| "cufe", | |||||
| "对外经济贸易大学", | |||||
| "对外经贸大学", | |||||
| "贸大", | |||||
| "university of international business and economics", | |||||
| "uibe", | |||||
| "外交学院", | |||||
| "外院", | |||||
| "china foreign affairs university", | |||||
| "cfau", | |||||
| "中国人民公安大学", | |||||
| "公安大学", | |||||
| "people's public security university of china", | |||||
| "ppsuc", | |||||
| "北京体育大学", | |||||
| "北体大", | |||||
| "beijing sport university", | |||||
| "bsu", | |||||
| "中央音乐学院", | |||||
| "央音", | |||||
| "中央院", | |||||
| "central conservatory of music", | |||||
| "ccom", | |||||
| "中国音乐学院", | |||||
| "国音", | |||||
| "中国院", | |||||
| "china conservatory of music", | |||||
| "ccmusic", | |||||
| "中央美术学院", | |||||
| "央美", | |||||
| "central academy of fine art", | |||||
| "cafa", | |||||
| "中央戏剧学院", | |||||
| "中戏", | |||||
| "the central academy of drama", | |||||
| "tcad", | |||||
| "中国政法大学", | |||||
| "法大", | |||||
| "china university of political science and law", | |||||
| "zuc", | |||||
| "cupl", | |||||
| "中国科学院大学", | |||||
| "国科大", | |||||
| "科院大", | |||||
| "university of chinese academy of sciences", | |||||
| "ucas", | |||||
| "福州大学", | |||||
| "福大", | |||||
| "university of fuzhou", | |||||
| "fzu", | |||||
| "暨南大学", | |||||
| "暨大", | |||||
| "ji'nan university", | |||||
| "jnu", | |||||
| "广州中医药大学", | |||||
| "广中医", | |||||
| "traditional chinese medicine university of guangzhou", | |||||
| "gucm", | |||||
| "华南师范大学", | |||||
| "华南师大", | |||||
| "south china normal university", | |||||
| "scnu", | |||||
| "广西大学", | |||||
| "西大", | |||||
| "guangxi university", | |||||
| "gxu", | |||||
| "贵州大学", | |||||
| "贵大", | |||||
| "guizhou university", | |||||
| "gzu", | |||||
| "海南大学", | |||||
| "海大", | |||||
| "university of hainan", | |||||
| "hainu", | |||||
| "河南大学", | |||||
| "河大", | |||||
| "he'nan university", | |||||
| "henu", | |||||
| "哈尔滨工程大学", | |||||
| "哈工程", | |||||
| "harbin engineering university", | |||||
| "heu", | |||||
| "东北农业大学", | |||||
| "东北农大", | |||||
| "northeast agricultural university", | |||||
| "neau", | |||||
| "东北林业大学", | |||||
| "东北林大", | |||||
| "northeast forestry university", | |||||
| "nefu", | |||||
| "中国地质大学", | |||||
| "地大", | |||||
| "china university of geosciences", | |||||
| "cug", | |||||
| "武汉理工大学", | |||||
| "武汉理工", | |||||
| "wuhan university of technology", | |||||
| "wut", | |||||
| "华中农业大学", | |||||
| "华中农大", | |||||
| "华农", | |||||
| "central china agricultural university", | |||||
| "hzau", | |||||
| "华中师范大学", | |||||
| "华中师大", | |||||
| "华大", | |||||
| "central china normal university", | |||||
| "ccnu", | |||||
| "中南财经政法大学", | |||||
| "中南大", | |||||
| "zhongnan university of economics & law", | |||||
| "zuel", | |||||
| "湖南师范大学", | |||||
| "湖南师大", | |||||
| "hunan normal university", | |||||
| "hunnu", | |||||
| "延边大学", | |||||
| "延大", | |||||
| "yanbian university", | |||||
| "ybu", | |||||
| "东北师范大学", | |||||
| "东北师大", | |||||
| "northeast normal university", | |||||
| "nenu", | |||||
| "苏州大学", | |||||
| "苏大", | |||||
| "soochow university", | |||||
| "suda", | |||||
| "南京航空航天大学", | |||||
| "南航", | |||||
| "nanjing aero-space university", | |||||
| "nuaa", | |||||
| "南京理工大学", | |||||
| "南理工", | |||||
| "institutes of technology of nanjing", | |||||
| "njust", | |||||
| "中国矿业大学", | |||||
| "中国矿大", | |||||
| "china mining university", | |||||
| "cumt", | |||||
| "南京邮电大学", | |||||
| "南邮", | |||||
| "nanjing university of posts and telecommunications", | |||||
| "njupt", | |||||
| "河海大学", | |||||
| "河海", | |||||
| "river sea university", | |||||
| "hhu", | |||||
| "江南大学", | |||||
| "江南大", | |||||
| "jiangnan university", | |||||
| "jiangnan", | |||||
| "南京林业大学", | |||||
| "南林", | |||||
| "nanjing forestry university", | |||||
| "njfu", | |||||
| "南京信息工程大学", | |||||
| "南信大", | |||||
| "nanjing university of information science and technology", | |||||
| "nuist", | |||||
| "南京农业大学", | |||||
| "南农", | |||||
| "南农大", | |||||
| "南京农大", | |||||
| "agricultural university of nanjing", | |||||
| "njau", | |||||
| "nau", | |||||
| "南京中医药大学", | |||||
| "南中医", | |||||
| "nanjing university of chinese medicine", | |||||
| "njucm", | |||||
| "中国药科大学", | |||||
| "中国药大", | |||||
| "china medicine university", | |||||
| "cpu", | |||||
| "南京师范大学", | |||||
| "南京师大", | |||||
| "南师大", | |||||
| "南师", | |||||
| "nanjing normal university", | |||||
| "nnu", | |||||
| "南昌大学", | |||||
| "昌大", | |||||
| "university of nanchang","nanchang university", | |||||
| "ncu", | |||||
| "辽宁大学", | |||||
| "辽大", | |||||
| "liaoning university", | |||||
| "lnu", | |||||
| "大连海事大学", | |||||
| "大连海大", | |||||
| "海大", | |||||
| "maritime affairs university of dalian", | |||||
| "dmu", | |||||
| "内蒙古大学", | |||||
| "内大", | |||||
| "university of the inner mongol","inner mongolia university", | |||||
| "imu", | |||||
| "宁夏大学", | |||||
| "宁大", | |||||
| "ningxia university", | |||||
| "nxu", | |||||
| "青海大学", | |||||
| "清大", | |||||
| "qinghai university", | |||||
| "qhu", | |||||
| "中国石油大学", | |||||
| "中石大", | |||||
| "china university of petroleum beijing", | |||||
| "upc", | |||||
| "太原理工大学", | |||||
| "太原理工", | |||||
| "institutes of technology of taiyuan","taiyuan university of technology", | |||||
| "tyut", | |||||
| "西北大学", | |||||
| "西大", | |||||
| "northwest university", | |||||
| "nwu", | |||||
| "西安电子科技大学", | |||||
| "西电", | |||||
| "xidian university", | |||||
| "xdu", | |||||
| "长安大学", | |||||
| "长大", | |||||
| "chang`an university", | |||||
| "chu", | |||||
| "陕西师范大学", | |||||
| "陕西师大", | |||||
| "陕师大", | |||||
| "shaanxi normal university", | |||||
| "snnu", | |||||
| "第四军医大学", | |||||
| "空军军医大学","四医大", | |||||
| "air force medical university", | |||||
| "fmmu", | |||||
| "华东理工大学", | |||||
| "华理", | |||||
| "east china university of science", | |||||
| "ecust", | |||||
| "东华大学", | |||||
| "东华", | |||||
| "donghua university", | |||||
| "dhu", | |||||
| "上海海洋大学", | |||||
| "上海海大", | |||||
| "shanghai ocean university", | |||||
| "shou", | |||||
| "上海中医药大学", | |||||
| "上中医", | |||||
| "shanghai university of traditional chinese medicine", | |||||
| "shutcm", | |||||
| "上海外国语大学", | |||||
| "上外", | |||||
| "shanghai international studies university", | |||||
| "sisu", | |||||
| "上海财经大学", | |||||
| "上海财大", | |||||
| "上财", | |||||
| "shanghai university of finance", | |||||
| "sufe", | |||||
| "上海体育学院", | |||||
| "shanghai university of sport", | |||||
| "上海音乐学院", | |||||
| "上音", | |||||
| "shanghai conservatory of music", | |||||
| "shcm", | |||||
| "上海大学", | |||||
| "上大", | |||||
| "shanghai university", | |||||
| "第二军医大学", | |||||
| "海军军医大学", | |||||
| "naval medical university", | |||||
| "西南交通大学", | |||||
| "西南交大", | |||||
| "southwest jiaotong university", | |||||
| "swjtu", | |||||
| "西南石油大学", | |||||
| "西南石大", | |||||
| "southwest petroleum university", | |||||
| "swpu", | |||||
| "成都理工大学", | |||||
| "成都理工", | |||||
| "chengdu university of technology", | |||||
| "cdut ", | |||||
| "四川农业大学", | |||||
| "川农", | |||||
| "川农大", | |||||
| "sichuan agricultural university", | |||||
| "sicau", | |||||
| "成都中医药大学", | |||||
| "成中医", | |||||
| "chengdu university of tcm", | |||||
| "cdutcm", | |||||
| "西南财经大学", | |||||
| "西南财大", | |||||
| "西财", | |||||
| "southwestern university of finance and economics", | |||||
| "swufe", | |||||
| "天津工业大学", | |||||
| "天工大", | |||||
| "tianjin university of technology", | |||||
| "tgu", | |||||
| "天津医科大学", | |||||
| "天津医大", | |||||
| "medical university of tianjin", | |||||
| "tmu", | |||||
| "天津中医药大学", | |||||
| "天中", | |||||
| "tianjin university of traditional chinese medicine", | |||||
| "tutcm", | |||||
| "华北电力大学", | |||||
| "华电", | |||||
| "north china electric power university", | |||||
| "ncepu", | |||||
| "河北工业大学", | |||||
| "河工大", | |||||
| "hebei university of technology", | |||||
| "hebut", | |||||
| "西藏大学", | |||||
| "藏大", | |||||
| "tibet university", | |||||
| "tu", | |||||
| "石河子大学", | |||||
| "石大", | |||||
| "shihezi university", | |||||
| "中国美术学院", | |||||
| "中国美院", | |||||
| "国美", | |||||
| "china academy of art", | |||||
| "caa", | |||||
| "宁波大学", | |||||
| "宁大", | |||||
| "ningbo university", | |||||
| "nbu", | |||||
| "西南大学", | |||||
| "西大", | |||||
| "southwest university", | |||||
| "swu", | |||||
| "安徽大学", | |||||
| "安大", | |||||
| "university of anhui", | |||||
| "ahu", | |||||
| "合肥工业大学", | |||||
| "合肥工大", | |||||
| "合工大", | |||||
| "hefei university of technology", | |||||
| "hfut", | |||||
| "中国地质大学", | |||||
| "地大", | |||||
| "china university of geosciences", | |||||
| "cug", | |||||
| "中国地质大学", | |||||
| "地大", | |||||
| "北京地大", | |||||
| "cugb", | |||||
| "中国矿业大学", | |||||
| "中国矿大", | |||||
| "china university of mining & technology", | |||||
| "cumtb", | |||||
| "中国石油大学", | |||||
| "中石大", | |||||
| "石大", | |||||
| "china university of petroleum", | |||||
| "cup", | |||||
| "中国石油大学", | |||||
| "中石大", | |||||
| "cup"] |
| # -*- coding: UTF-8 -*- | |||||
| import os, json,re,copy | |||||
| import pandas as pd | |||||
# Directory that holds this module; resource files are resolved relative to it.
current_file_path = os.path.dirname(os.path.abspath(__file__))

# School table loaded from a tab-separated file; missing cells become "".
TBL = pd.read_csv(os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0).fillna("")
# English names are compared lower-cased and stripped (see select()).
TBL["name_en"] = TBL["name_en"].map(lambda x: x.lower().strip())

# Names of "good" schools. The file is opened with an explicit encoding and
# closed deterministically instead of relying on garbage collection.
with open(os.path.join(current_file_path, "res/good_sch.json"), "r", encoding="utf-8") as _good_sch_file:
    GOOD_SCH = json.load(_good_sch_file)
# Punctuation and spaces are removed so lookups in is_good() compare compact names.
GOOD_SCH = {re.sub(r"[,. &()()]+", "", c) for c in GOOD_SCH}
def loadRank(fnm):
    """Fill the ``rank`` column of the module-level ``TBL`` from a CSV file.

    Every row first gets the sentinel rank ``1000000``. Each line of *fnm* is
    expected to look like ``<school name>,<integer rank>``; the rank is written
    to every row whose ``name_cn`` or ``name_en`` equals the name.

    Lines that cannot be parsed (missing column, non-integer rank) are skipped.
    """
    global TBL
    TBL["rank"] = 1000000
    with open(fnm, "r", encoding='UTF-8') as f:
        for line in f:
            cols = line.strip("\n").split(",")
            try:
                nm, rk = cols[0].strip(), int(cols[1])
            except (IndexError, ValueError):
                # Malformed line: best-effort loader, just skip it.
                continue
            # Kept outside the try so that a broken table (e.g. a missing
            # column) is reported instead of being silently ignored per line.
            TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk


# Populate the ranks once at import time.
loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
def split(txt):
    """Split *txt* on runs of spaces/tabs, keeping Latin-letter phrases together.

    A token that starts with an ASCII letter is glued (with a single space)
    onto the previous token when that one ends with an ASCII letter, so
    ``"peking university 北京大学"`` yields ``["peking university", "北京大学"]``.
    """
    pieces = []
    for tok in re.sub(r"[ \t]+", " ", txt).split(" "):
        continues_phrase = (
            bool(pieces)
            and re.match(r".*[a-zA-Z]$", pieces[-1])
            and re.match(r"[a-zA-Z]", tok)
        )
        if not continues_phrase:
            pieces.append(tok)
            continue
        pieces[-1] = pieces[-1] + " " + tok
    return pieces
def select(nm):
    """Look up a school in ``TBL`` by Chinese name, English name or alias.

    *nm* may be a string or a list (only its first element is used). Only the
    first phrase returned by ``split`` is considered. The name is lower-cased,
    parenthesised parts, some punctuation, a leading "the " and a few leading
    country names are removed, and "大学…学院" is collapsed to "大学".

    Returns the first matching row as a dict (including a ``hit_alias`` flag),
    or ``None`` when nothing matches or the normalised name is empty.
    """
    global TBL
    if not nm:
        return
    if isinstance(nm, list):
        nm = str(nm[0])
    if not isinstance(nm, str):
        # split() runs a regex over its argument and would raise on non-strings.
        nm = str(nm)
    nm = split(nm)[0]
    nm = str(nm).lower().strip()
    nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
    nm = re.sub(r"(^the |[,.&()();;·]+|^(英国|美国|瑞士))", "", nm)
    nm = re.sub(r"大学.*学院", "大学", nm)
    if not nm:
        # An empty key would equal every blank name/alias cell in the table
        # (missing cells are stored as "") and return an unrelated school.
        return

    # Build the match mask on the shared table instead of deep-copying the
    # whole table for every lookup; only the matching rows are materialised.
    hit_alias = TBL["alias"].map(lambda x: nm in set(x.split("+")))
    mask = (TBL.name_cn == nm) | (TBL.name_en == nm) | hit_alias
    res = TBL[mask].assign(hit_alias=hit_alias[mask])
    if res.empty:
        return
    return json.loads(res.to_json(orient="records"))[0]
def is_good(nm):
    """Return True when the normalised school name *nm* is in ``GOOD_SCH``.

    Normalisation: lower-case, drop parenthesised parts, then drop quotes,
    spaces and the listed punctuation characters.
    """
    key = nm.lower()
    key = re.sub(r"[((][^()()]+[))]", "", key)
    key = re.sub(r"[''`‘’“”,. &()();;]+", "", key)
    return key in GOOD_SCH
| # -*- coding: utf-8 -*- | |||||
| import json | |||||
| from deepdoc.parser.resume.entities import degrees, regions, industries | |||||
# Output schema of refactor(): one "<name> <type>" entry per field.
# refactor() uses only the part before the first space as the dict key and
# pairs the entries POSITIONALLY with the alphabetically sorted DataFrame
# columns, so the order here must stay aligned with that sorted column order
# (e.g. the "work" column lands on "work_obj", "education" on "education_obj").
FIELDS = [
    "address STRING",
    "annual_salary int",
    "annual_salary_from int",
    "annual_salary_to int",
    "birth STRING",
    "card STRING",
    "certificate_obj string",
    "city STRING",
    "corporation_id int",
    "corporation_name STRING",
    "corporation_type STRING",
    "degree STRING",
    "discipline_name STRING",
    "education_obj string",
    "email STRING",
    "expect_annual_salary int",
    "expect_city_names string",
    "expect_industry_name STRING",
    "expect_position_name STRING",
    "expect_salary_from int",
    "expect_salary_to int",
    "expect_type STRING",
    "gender STRING",
    "industry_name STRING",
    "industry_names STRING",
    "is_deleted STRING",
    "is_fertility STRING",
    "is_house STRING",
    "is_management_experience STRING",
    "is_marital STRING",
    "is_oversea STRING",
    "language_obj string",
    "name STRING",
    "nation STRING",
    "phone STRING",
    "political_status STRING",
    "position_name STRING",
    "project_obj string",
    "responsibilities string",
    "salary_month int",
    "scale STRING",
    "school_name STRING",
    "self_remark string",
    "skill_obj string",
    "title_name STRING",
    "tob_resume_id STRING",
    "updated_at Timestamp",
    "wechat STRING",
    "work_obj string",
    "work_experience int",
    "work_start_time BIGINT"
]
def refactor(df):
    """Flatten the JSON in ``df["resume_content"]`` into one column per field.

    The frame is modified in place while columns are being added. The result
    is a dict built from the FIRST row only: keys are the field names from
    ``FIELDS``, values are that row's cells taken in sorted column order.
    """
    def deal_obj(obj, k, kk):
        # Return obj[k][kk], or "" when either level is not a dict.
        if not isinstance(obj, type({})):
            return ""
        obj = obj.get(k, {})
        if not isinstance(obj, type({})):
            return ""
        return obj.get(kk, "")

    def loadjson(line):
        # Best-effort JSON parse: anything unparsable becomes an empty dict.
        try:
            return json.loads(line)
        except Exception as e:
            pass
        return {}

    df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
    df.fillna("", inplace=True)

    # Names of the columns that end up in the output; extended by extract().
    clms = ["tob_resume_id", "updated_at"]

    def extract(nms, cc=None):
        # With cc: copy obj[cc][name] into column `name`.
        # Without cc: store obj[name] as a JSON string.
        nonlocal clms
        clms.extend(nms)
        for c in nms:
            if cc:
                df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
            else:
                # NOTE(review): the else-branch stringifies the whole object
                # `x`, not `x.get(c)` - looks unintended, confirm.
                df[c] = df["obj"].map(
                    lambda x: json.dumps(
                        x.get(
                            c,
                            {}),
                        ensure_ascii=False) if isinstance(
                        x,
                        type(
                            {})) and (
                        isinstance(
                            x.get(c),
                            type(
                                {})) or not x.get(c)) else str(x).replace(
                        "None",
                        ""))

    extract(["education", "work", "certificate", "project", "language",
             "skill"])
    extract(["wechat", "phone", "is_deleted",
             "name", "tel", "email"], "contact")
    extract(["nation", "expect_industry_name", "salary_month",
             "industry_ids", "is_house", "birth", "annual_salary_from",
             "annual_salary_to", "card",
             "expect_salary_to", "expect_salary_from",
             "expect_position_name", "gender", "city",
             "is_fertility", "expect_city_names",
             "political_status", "title_name", "expect_annual_salary",
             "industry_name", "address", "position_name", "school_name",
             "corporation_id",
             "is_oversea", "responsibilities",
             "work_start_time", "degree", "management_experience",
             "expect_type", "corporation_type", "scale", "corporation_name",
             "self_remark", "annual_salary", "work_experience",
             "discipline_name", "marital", "updated_at"], "basic")

    # Map coded values to names through the entity lookup modules.
    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
    df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
    df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in
                                                                      str(x).split(",")]))
    clms.append("industry_names")

    def arr2str(a):
        # Lists are joined with spaces; commas in the result become spaces.
        if not a:
            return ""
        if isinstance(a, list):
            a = " ".join([str(i) for i in a])
        return str(a).replace(",", " ")

    df["expect_industry_name"] = df["expect_industry_name"].map(
        lambda x: arr2str(x))
    df["gender"] = df["gender"].map(
        lambda x: "男" if x == 'M' else (
            "女" if x == 'F' else ""))
    # 'Y' / 'N' flags become '是' / '否'; anything else becomes "".
    for c in ["is_fertility", "is_oversea", "is_house",
              "management_experience", "marital"]:
        df[c] = df[c].map(
            lambda x: '是' if x == 'Y' else (
                '否' if x == 'N' else ""))
    df["is_management_experience"] = df["management_experience"]
    df["is_marital"] = df["marital"]
    clms.extend(["is_management_experience", "is_marital"])

    df.fillna("", inplace=True)
    # Use "tel" as the phone number when "phone" is blank.
    # NOTE(review): df.loc[i, ...] assumes the index labels are 0..len(df)-1 - confirm.
    for i in range(len(df)):
        if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
            df.loc[i, "phone"] = df.loc[i, "tel"].strip()

    # Drop helper columns from the output list (first occurrence of each).
    for n in ["industry_ids", "management_experience", "marital", "tel"]:
        for i in range(len(clms)):
            if clms[i] == n:
                del clms[i]
                break

    # De-duplicate ("updated_at" was added twice).
    clms = list(set(clms))

    # Sorted column order is what lines the values up with FIELDS below.
    df = df.reindex(sorted(clms), axis=1)
    # Stringify every cell and neutralise tabs and line breaks.
    for c in clms:
        df[c] = df[c].map(
            lambda s: str(s).replace(
                "\t",
                " ").replace(
                "\n",
                "\\n").replace(
                "\r",
                "\\n"))
    # Only the first row is returned.
    return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
| # -*- coding: utf-8 -*- | |||||
| import re, copy, time, datetime, demjson, \ | |||||
| traceback, signal | |||||
| import numpy as np | |||||
| from deepdoc.parser.resume.entities import degrees, schools, corporations | |||||
| from rag.nlp import huqie, surname | |||||
| from xpinyin import Pinyin | |||||
| from contextlib import contextmanager | |||||
class TimeoutException(Exception):
    """Raised by the SIGALRM handler of ``time_limit`` when the time budget runs out."""
@contextmanager
def time_limit(seconds):
    """Context manager that raises ``TimeoutException`` after *seconds* seconds.

    Implemented with ``SIGALRM``, so it is only usable where that signal
    exists and where ``signal.signal`` may be called. On exit the pending
    alarm is cancelled and the SIGALRM handler that was installed before
    entering is put back, so the timeout handler does not leak into
    unrelated code.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
        # signal.signal() returns None when the old handler was not installed
        # from Python; such a handler cannot be re-installed from here.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
# Not referenced in the visible part of this module.
ENV = None
# Shared pinyin converter; parse() uses it to build the name_py_* fields.
PY = Pinyin()
def rmHtmlTag(line):
    """Replace simple HTML-like tags in *line* with a single space.

    A tag is ``<`` + one or more of the characters letters, digits,
    ``. " = ' ; , : + _ /``, space or ``-`` + ``>``, matched
    case-insensitively. At most 100000 replacements are made.
    """
    # count/flags are passed by keyword: positional use is deprecated and
    # makes it easy to confuse the two integer arguments.
    return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, count=100000, flags=re.IGNORECASE)
def highest_degree(dg):
    """Return the highest-ranked degree name in *dg*.

    *dg* is a single name or a collection of names. Unknown names rank below
    every known one; ties go to the earliest entry. Empty input gives "".
    """
    if not dg:
        return ""
    if type(dg) is str:
        dg = [dg]
    level = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
    # max() keeps the first of several equally ranked entries, like a stable
    # descending sort followed by taking the head.
    return max(dg, key=lambda name: level.get(name, -1))
def forEdu(cv):
    """Derive education features from ``cv["education_obj"]`` and store them on *cv*.

    Without education records ``cv["integerity_flt"]`` is multiplied by 0.8 and
    *cv* is returned unchanged otherwise. With records (processed in order of
    ``start_time``) the function fills, when data is available: ``edu_nst``,
    ``edu_fea_kwd``, ``edu_first_fea_kwd``, ``major_kwd``, ``school_name_kwd``,
    ``first_*_kwd``, ``edu_start_kwd``, ``edu_end_kwd``, ``edu_end_int``,
    ``degree_kwd``, ``highest_degree_kwd``, ``school_rank_int``,
    ``sch_rank_kwd``, ``work_exp_flt``, ``tag_kwd`` and the matching ``*_tks``
    token fields.

    Returns the same *cv* dict.
    """
    if not cv.get("education_obj"):
        cv["integerity_flt"] *= 0.8
        return cv

    first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
    edu_nst = []
    edu_end_dt = ""
    cv["school_rank_int"] = 1000000
    for n in sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3")):
        e = {}
        if n.get("end_time"):
            if n["end_time"] > edu_end_dt:
                edu_end_dt = n["end_time"]
            try:
                dt = n["end_time"]
                if re.match(r"[0-9]{9,}", dt):
                    dt = turnTm2Dt(dt)
                y, m, d = getYMD(dt)
                ed_dt.append(str(y))
                e["end_dt_kwd"] = str(y)
            except Exception:
                # Best-effort date parsing. The exception must NOT be bound to
                # `e`: that would replace and then delete the record dict and
                # make every later `e[...]` assignment fail.
                pass
        if n.get("start_time"):
            try:
                dt = n["start_time"]
                if re.match(r"[0-9]{9,}", dt):
                    dt = turnTm2Dt(dt)
                y, m, d = getYMD(dt)
                st_dt.append(str(y))
                e["start_dt_kwd"] = str(y)
            except Exception:
                pass

        r = schools.select(n.get("school_name", ""))
        if r:
            if str(r.get("type", "")) == "1":
                fea.append("211")
            if str(r.get("type", "")) == "2":
                fea.append("211")
            if str(r.get("is_abroad", "")) == "1":
                fea.append("留学")
            if str(r.get("is_double_first", "")) == "1":
                fea.append("双一流")
            if str(r.get("is_985", "")) == "1":
                fea.append("985")
            if str(r.get("is_world_known", "")) == "1":
                fea.append("海外知名")
            if r.get("rank") and cv["school_rank_int"] > r["rank"]:
                cv["school_rank_int"] = r["rank"]

        if n.get("school_name") and isinstance(n["school_name"], str):
            sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
            e["sch_nm_kwd"] = sch[-1]
            # The last fine-grained token of the school name is kept as a feature.
            fea.append(huqie.qieqie(huqie.qie(n.get("school_name", ""))).split(" ")[-1])

        if n.get("discipline_name") and isinstance(n["discipline_name"], str):
            maj.append(n["discipline_name"])
            e["major_kwd"] = n["discipline_name"]

        # A missing degree at a 985 school, before any first degree was
        # recorded, is given the degree code "1".
        if not n.get("degree") and "985" in fea and not first_fea:
            n["degree"] = "1"

        if n.get("degree"):
            d = degrees.get_name(n["degree"])
            if d:
                e["degree_kwd"] = d
            if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg
                              or re.search(r"(成人|自考|自学考试)", n.get("school_name", ""))):
                d = "专升本"
            if d:
                deg.append(d)

            # for first degree
            if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
                fdeg = [d]
                if n.get("school_name"):
                    fsch = [n["school_name"]]
                if n.get("discipline_name"):
                    fmaj = [n["discipline_name"]]
                first_fea = copy.deepcopy(fea)

        edu_nst.append(e)

    # NOTE(review): the checks below look for "海外名校", while the feature
    # appended above is "海外知名" - confirm which tag is intended.
    rank = cv["school_rank_int"]
    overseas = "海外名校" in fea
    cv["sch_rank_kwd"] = []
    if rank <= 20 or (overseas and rank <= 200):
        cv["sch_rank_kwd"].append("顶尖学校")
    elif rank <= 50 and rank > 20 or (overseas and rank <= 500 and rank > 200):
        cv["sch_rank_kwd"].append("精英学校")
    elif rank > 50 and ("985" in fea or "211" in fea) or (overseas and rank > 500):
        cv["sch_rank_kwd"].append("优质学校")
    else:
        cv["sch_rank_kwd"].append("一般学校")

    if edu_nst:
        cv["edu_nst"] = edu_nst
    if fea:
        cv["edu_fea_kwd"] = list(set(fea))
    if first_fea:
        cv["edu_first_fea_kwd"] = list(set(first_fea))
    if maj:
        cv["major_kwd"] = maj
    if fsch:
        cv["first_school_name_kwd"] = fsch
    if fdeg:
        cv["first_degree_kwd"] = fdeg
    if fmaj:
        cv["first_major_kwd"] = fmaj
    if st_dt:
        cv["edu_start_kwd"] = st_dt
    if ed_dt:
        cv["edu_end_kwd"] = ed_dt
    if ed_dt:
        cv["edu_end_int"] = max([int(t) for t in ed_dt])
    if deg:
        if "本科" in deg and "专科" in deg:
            deg.append("专升本")
            deg = [d for d in deg if d != '本科']
        cv["degree_kwd"] = deg
        cv["highest_degree_kwd"] = highest_degree(deg)
    if edu_end_dt:
        try:
            if re.match(r"[0-9]{9,}", edu_end_dt):
                edu_end_dt = turnTm2Dt(edu_end_dt)
            if edu_end_dt.strip("\n") == "至今":
                edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
            y, m, d = getYMD(edu_end_dt)
            # Years since the latest graduation cap the work experience.
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as err:
            print("EXCEPTION: ", err, edu_end_dt, cv.get("work_exp_flt"))
    if sch:
        cv["school_name_kwd"] = sch
        if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
                or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
                or not cv.get("degree_kwd"):
            for c in sch:
                if schools.is_good(c):
                    if "tag_kwd" not in cv:
                        cv["tag_kwd"] = []
                    cv["tag_kwd"].append("好学校")
                    cv["tag_kwd"].append("好学历")
                    break
        if (len(cv.get("degree_kwd", [])) >= 1 and
                "本科" in cv["degree_kwd"] and
                any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
                or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
                or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
            if "tag_kwd" not in cv:
                cv["tag_kwd"] = []
            if "好学历" not in cv["tag_kwd"]:
                cv["tag_kwd"].append("好学历")

    if cv.get("major_kwd"):
        cv["major_tks"] = huqie.qie(" ".join(maj))
    if cv.get("school_name_kwd"):
        cv["school_name_tks"] = huqie.qie(" ".join(sch))
    if cv.get("first_school_name_kwd"):
        cv["first_school_name_tks"] = huqie.qie(" ".join(fsch))
    if cv.get("first_major_kwd"):
        cv["first_major_tks"] = huqie.qie(" ".join(fmaj))

    return cv
def forProj(cv):
    """Tokenise project names and descriptions from ``cv["project_obj"]``.

    Projects are visited from the most recent ``updated_at`` to the oldest.
    ``project_name_tks`` / ``project_desc_ltks`` come from the first name /
    first description text found; ``pro_desc_ltks`` covers all description
    texts (``describe``, ``responsibilities``, ``achivement``).

    Entries that are not dicts are ignored. Returns the same *cv* dict.
    """
    if not cv.get("project_obj"):
        return cv

    # The original sort key already tolerated non-dict entries, but the loop
    # then called .get() on them; skip them up front instead of crashing.
    projects = [p for p in cv["project_obj"] if isinstance(p, dict)]
    projects.sort(key=lambda p: str(p.get("updated_at", "")), reverse=True)

    pro_nms, desc = [], []
    for n in projects:
        if n.get("name"):
            pro_nms.append(n["name"])
        # "achivement" (sic) is the key name used by the input data.
        for field in ("describe", "responsibilities", "achivement"):
            if n.get(field):
                desc.append(str(n[field]))

    if pro_nms:
        cv["project_name_tks"] = huqie.qie(pro_nms[0])
    if desc:
        cv["pro_desc_ltks"] = huqie.qie(rmHtmlTag(" ".join(desc)))
        cv["project_desc_ltks"] = huqie.qie(rmHtmlTag(desc[0]))

    return cv
def json_loads(line):
    """Decode *line* with demjson after quoting bare ``True`` / ``False`` values.

    Every ``: True`` / ``: False`` (any number of spaces after the colon) is
    rewritten to ``: 'True'`` / ``: 'False'`` before decoding.
    """
    quoted = re.sub(r": *(True|False)", r": '\1'", line)
    return demjson.decode(quoted)
def forWork(cv):
    """Derive work-history features from ``cv["work_obj"]`` and store them on *cv*.

    Without work records ``cv["integerity_flt"]`` is multiplied by 0.7 and *cv*
    is returned. Otherwise jobs are visited from the latest ``start_time`` to
    the earliest; index 0 is treated as the current job. The function fills,
    when data is available: ``tag_kwd``, ``corp_tag_kwd``, ``latest_job_dt``,
    ``corporation_id``, the ``position_name`` / ``industry_name`` /
    ``corporation_name`` / ``responsibilities`` token fields,
    ``max_sub_cnt_int``, ``baike_flt``, ``work_exp_flt``, ``job_num_int``,
    ``dua_flt``, ``cur_dua_int`` and ``scale_flt``.

    Returns the same *cv* dict.
    """
    if not cv.get("work_obj"):
        cv["integerity_flt"] *= 0.7
        return cv

    flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
            "industry_name", "subordinates_count"]
    duas = []      # job durations in days
    scales = []    # leading integer of each job's "scale"
    fea = {c: [] for c in flds}
    latest_job_tm = ""
    goodcorp = False    # current employer is a "good" company
    goodcorp_ = False   # some earlier employer is a "good" company
    work_st_tm = ""     # smallest start_time seen
    corp_tags = []
    for i, n in enumerate(
            sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if type(x) == type({}) else "",
                   reverse=True)):
        if type(n) == type(""):
            try:
                n = json_loads(n)
            except Exception:
                # Undecodable record: skip it.
                continue

        if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm):
            work_st_tm = n["start_time"]
        for c in flds:
            if not n.get(c) or str(n[c]) == '0':
                # Keep the per-field lists aligned with the job order.
                fea[c].append("")
                continue
            if c == "corporation_name":
                n[c] = corporations.corpNorm(n[c], False)
                if corporations.is_good(n[c]):
                    if i == 0:
                        goodcorp = True
                    else:
                        goodcorp_ = True
                ct = corporations.corp_tag(n[c])
                if i == 0:
                    corp_tags.extend(ct)
                elif ct and ct[0] != "软外":
                    corp_tags.extend([f"{t}(曾)" for t in ct])

            fea[c].append(rmHtmlTag(str(n[c]).lower()))

        y, m, d = getYMD(n.get("start_time"))
        if not y or not m:
            continue
        st = "%s-%02d-%02d" % (y, int(m), int(d))
        # NOTE(review): overwritten on every iteration, so the value kept is
        # the one from the last job processed - confirm this is intended.
        latest_job_tm = st

        y, m, d = getYMD(n.get("end_time"))
        if (not y or not m) and i > 0:
            continue
        # NOTE(review): 2022 is a hard-coded cutoff; presumably it guards
        # against open-ended/future end dates - confirm.
        if not y or not m or int(y) > 2022:
            y, m, d = getYMD(str(n.get("updated_at", "")))
        if not y or not m:
            continue
        ed = "%s-%02d-%02d" % (y, int(m), int(d))

        try:
            duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
        except Exception:
            print("kkkkkkkkkkkkkkkkkkkk", n.get("start_time"), n.get("end_time"))

        if n.get("scale"):
            r = re.search(r"^([0-9]+)", str(n["scale"]))
            if r:
                scales.append(int(r.group(1)))

    if goodcorp:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].append("好公司")
    if goodcorp_:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].append("好公司(曾)")

    if corp_tags:
        if "tag_kwd" not in cv:
            cv["tag_kwd"] = []
        cv["tag_kwd"].extend(corp_tags)
        cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]

    if latest_job_tm:
        cv["latest_job_dt"] = latest_job_tm
    if fea["corporation_id"]:
        cv["corporation_id"] = fea["corporation_id"]

    if fea["position_name"]:
        cv["position_name_tks"] = huqie.qie(fea["position_name"][0])
        cv["position_name_sm_tks"] = huqie.qieqie(cv["position_name_tks"])
        cv["pos_nm_tks"] = huqie.qie(" ".join(fea["position_name"][1:]))

    if fea["industry_name"]:
        cv["industry_name_tks"] = huqie.qie(fea["industry_name"][0])
        cv["industry_name_sm_tks"] = huqie.qieqie(cv["industry_name_tks"])
        cv["indu_nm_tks"] = huqie.qie(" ".join(fea["industry_name"][1:]))

    if fea["corporation_name"]:
        cv["corporation_name_kwd"] = fea["corporation_name"][0]
        cv["corp_nm_kwd"] = fea["corporation_name"]
        cv["corporation_name_tks"] = huqie.qie(fea["corporation_name"][0])
        cv["corporation_name_sm_tks"] = huqie.qieqie(cv["corporation_name_tks"])
        cv["corp_nm_tks"] = huqie.qie(" ".join(fea["corporation_name"][1:]))

    if fea["responsibilities"]:
        cv["responsibilities_ltks"] = huqie.qie(fea["responsibilities"][0])
        cv["resp_ltks"] = huqie.qie(" ".join(fea["responsibilities"][1:]))

    if fea["subordinates_count"]:
        # Keep only purely numeric entries. The previous pattern r"[^0-9]+$"
        # selected the NON-numeric ones, so int() either raised or no count
        # ever survived.
        fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
                                     re.match(r"[0-9]+$", str(i))]
    if fea["subordinates_count"]:
        cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])

    if type(cv.get("corporation_id")) == type(1):
        cv["corporation_id"] = [str(cv["corporation_id"])]
    if not cv.get("corporation_id"):
        cv["corporation_id"] = []
    for i in cv.get("corporation_id", []):
        cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)

    if work_st_tm:
        try:
            if re.match(r"[0-9]{9,}", work_st_tm):
                work_st_tm = turnTm2Dt(work_st_tm)
            y, m, d = getYMD(work_st_tm)
            # Years since the first job cap the work experience.
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as err:
            print("EXCEPTION: ", err, work_st_tm, cv.get("work_exp_flt"))

    cv["job_num_int"] = 0
    if duas:
        cv["dua_flt"] = np.mean(duas)
        cv["cur_dua_int"] = duas[0]
        cv["job_num_int"] = len(duas)
    if scales:
        cv["scale_flt"] = np.max(scales)
    return cv
def turnTm2Dt(b):
    """Turn an epoch timestamp into a ``YYYY-mm-dd HH:MM:SS`` local-time string.

    Falsy input returns ``None``. Otherwise the value is stringified and
    stripped; if it starts with at least ten digits, the first ten are read as
    epoch seconds (so millisecond timestamps work too) and formatted in local
    time. Any other text is returned stripped but otherwise unchanged.
    """
    if not b:
        return None
    text = str(b).strip()
    if not re.match(r"[0-9]{10,}", text):
        return text
    epoch_seconds = int(text[:10])
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(epoch_seconds))
def getYMD(b):
    """Extract ``(year, month, day)`` from a date-like value.

    Falsy input returns ``("", "", "01")``. Otherwise the value goes through
    ``turnTm2Dt`` first. ``year`` is an int when the text starts with four
    digits, else ``""``. ``month`` and ``day`` are digit strings; a missing or
    out-of-range value (day 0 or > 31, month < 1 or > 12) becomes ``"1"``, and
    a text without any day match keeps the default ``"01"``.

    NOTE(review): for year-month text such as "2020-05" the day pattern can
    backtrack into the month digits - confirm this is acceptable for callers.
    """
    year, month, day = "", "", "01"
    if not b:
        return (year, month, day)

    text = turnTm2Dt(b)
    if re.match(r"[0-9]{4}", text):
        year = int(text[:4])

    month_hit = re.search(r"[0-9]{4}.?([0-9]{1,2})", text)
    if month_hit:
        month = month_hit.group(1)

    day_hit = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", text)
    if day_hit:
        day = day_hit.group(1)

    if not day or int(day) == 0 or int(day) > 31:
        day = "1"
    if not month or int(month) > 12 or int(month) < 1:
        month = "1"
    return (year, month, day)
def birth(cv):
    """Add birth-date features to *cv* from ``cv["birth"]``.

    Without a birth value ``cv["integerity_flt"]`` is multiplied by 0.9.
    Otherwise, when a year and month can be extracted, sets ``birth_dt``
    (``YYYY-mm-dd``), ``birthday_kwd`` (``mmdd``) and ``age_int`` (current
    year minus birth year). Returns the same *cv* dict.
    """
    if not cv.get("birth"):
        cv["integerity_flt"] *= 0.9
        return cv

    year, month, day = getYMD(cv["birth"])
    if not (month and year):
        return cv

    month_no, day_no = int(month), int(day)
    cv["birth_dt"] = "%s-%02d-%02d" % (year, month_no, day_no)
    cv["birthday_kwd"] = "%02d%02d" % (month_no, day_no)
    cv["age_int"] = datetime.datetime.now().year - int(year)
    return cv
| def parse(cv): | |||||
| for k in cv.keys(): | |||||
| if cv[k] == '\\N': cv[k] = '' | |||||
| # cv = cv.asDict() | |||||
| tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names", | |||||
| "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name", | |||||
| "position_name", "school_name", "self_remark", "title_name"] | |||||
| small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"] | |||||
| kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email", | |||||
| "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name", | |||||
| "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"] | |||||
| num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from", | |||||
| "expect_salary_to", "salary_month"] | |||||
| is_fld = [ | |||||
| ("is_fertility", "已育", "未育"), | |||||
| ("is_house", "有房", "没房"), | |||||
| ("is_management_experience", "有管理经验", "无管理经验"), | |||||
| ("is_marital", "已婚", "未婚"), | |||||
| ("is_oversea", "有海外经验", "无海外经验") | |||||
| ] | |||||
| rmkeys = [] | |||||
| for k in cv.keys(): | |||||
| if cv[k] is None: rmkeys.append(k) | |||||
| if (type(cv[k]) == type([]) or type(cv[k]) == type("")) and len(cv[k]) == 0: rmkeys.append(k) | |||||
| for k in rmkeys: del cv[k] | |||||
| integerity = 0. | |||||
| flds_num = 0. | |||||
| def hasValues(flds): | |||||
| nonlocal integerity, flds_num | |||||
| flds_num += len(flds) | |||||
| for f in flds: | |||||
| v = str(cv.get(f, "")) | |||||
| if len(v) > 0 and v != '0' and v != '[]': integerity += 1 | |||||
| hasValues(tks_fld) | |||||
| hasValues(small_tks_fld) | |||||
| hasValues(kwd_fld) | |||||
| hasValues(num_fld) | |||||
| cv["integerity_flt"] = integerity / flds_num | |||||
| if cv.get("corporation_type"): | |||||
| for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""), | |||||
| (r"[//.· <\((]+.*", ""), | |||||
| (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"), | |||||
| (r".*(机关|事业).*", "机关"), | |||||
| (r".*(非盈利|Non-profit).*", "非盈利"), | |||||
| (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"), | |||||
| (r".*国有.*", "国企"), | |||||
| (r"[ ()\(\)人/·0-9-]+", ""), | |||||
| (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]: | |||||
| cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE) | |||||
| if len(cv["corporation_type"]) < 2: del cv["corporation_type"] | |||||
| if cv.get("political_status"): | |||||
| for p, r in [ | |||||
| (r".*党员.*", "党员"), | |||||
| (r".*(无党派|公民).*", "群众"), | |||||
| (r".*团员.*", "团员")]: | |||||
| cv["political_status"] = re.sub(p, r, cv["political_status"]) | |||||
| if not re.search(r"[党团群]", cv["political_status"]): del cv["political_status"] | |||||
| if cv.get("phone"): cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"])) | |||||
| keys = list(cv.keys()) | |||||
| for k in keys: | |||||
| # deal with json objects | |||||
| if k.find("_obj") > 0: | |||||
| try: | |||||
| cv[k] = json_loads(cv[k]) | |||||
| cv[k] = [a for _, a in cv[k].items()] | |||||
| nms = [] | |||||
| for n in cv[k]: | |||||
| if type(n) != type({}) or "name" not in n or not n.get("name"): continue | |||||
| n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower() | |||||
| if not n["name"]: continue | |||||
| nms.append(n["name"]) | |||||
| if nms: | |||||
| t = k[:-4] | |||||
| cv[f"{t}_kwd"] = nms | |||||
| cv[f"{t}_tks"] = huqie.qie(" ".join(nms)) | |||||
| except Exception as e: | |||||
| print("【EXCEPTION】:", str(traceback.format_exc()), cv[k]) | |||||
| cv[k] = [] | |||||
| # tokenize fields | |||||
| if k in tks_fld: | |||||
| cv[f"{k}_tks"] = huqie.qie(cv[k]) | |||||
| if k in small_tks_fld: cv[f"{k}_sm_tks"] = huqie.qie(cv[f"{k}_tks"]) | |||||
| # keyword fields | |||||
| if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower() | |||||
| for n in re.split(r"[\t,,;;. ]", | |||||
| re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k]) | |||||
| ) if n] | |||||
| if k in num_fld and cv.get(k): cv[f"{k}_int"] = cv[k] | |||||
| cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "") | |||||
| # for name field | |||||
| if cv.get("name"): | |||||
| nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip()) | |||||
| nm = re.sub(r"[ \t ]+", " ", nm) | |||||
| if re.match(r"[a-zA-Z ]+$", nm): | |||||
| if len(nm.split(" ")) > 1: | |||||
| cv["name"] = nm | |||||
| else: | |||||
| nm = "" | |||||
| elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])): | |||||
| nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5]) | |||||
| else: | |||||
| nm = "" | |||||
| cv["name"] = nm.strip() | |||||
| name = cv["name"] | |||||
| # name pingyin and its prefix | |||||
| cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' ')) | |||||
| cv["name_py_pref0_tks"] = "" | |||||
| cv["name_py_pref_tks"] = "" | |||||
| for py in PY.get_pinyins(nm[:20], ''): | |||||
| for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i] | |||||
| for py in PY.get_pinyins(nm[:20], ' '): | |||||
| py = py.split(" ") | |||||
| for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i]) | |||||
| cv["name_kwd"] = name | |||||
| cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3] | |||||
| cv["name_tks"] = ( | |||||
| huqie.qie(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "") | |||||
| ) if name else "" | |||||
| else: | |||||
| cv["integerity_flt"] /= 2. | |||||
| if cv.get("phone"): | |||||
| r = re.search(r"(1[3456789][0-9]{9})", cv["phone"]) | |||||
| if not r: | |||||
| cv["phone"] = "" | |||||
| else: | |||||
| cv["phone"] = r.group(1) | |||||
| # deal with date fields | |||||
| if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime): | |||||
| cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S') | |||||
| else: | |||||
| y, m, d = getYMD(str(cv.get("updated_at", ""))) | |||||
| if not y: y = "2012" | |||||
| if not m: m = "01" | |||||
| if not d: d = "01" | |||||
| cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d)) | |||||
| # long text tokenize | |||||
| if cv.get("responsibilities"): cv["responsibilities_ltks"] = huqie.qie(rmHtmlTag(cv["responsibilities"])) | |||||
| # for yes or no field | |||||
| fea = [] | |||||
| for f, y, n in is_fld: | |||||
| if f not in cv: continue | |||||
| if cv[f] == '是': fea.append(y) | |||||
| if cv[f] == '否': fea.append(n) | |||||
| if fea: cv["tag_kwd"] = fea | |||||
| cv = forEdu(cv) | |||||
| cv = forProj(cv) | |||||
| cv = forWork(cv) | |||||
| cv = birth(cv) | |||||
| cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])] | |||||
| for i in range(len(cv["corp_proj_sch_deg_kwd"])): | |||||
| for j in cv.get("sch_rank_kwd", []): cv["corp_proj_sch_deg_kwd"][i] += "+" + j | |||||
| for i in range(len(cv["corp_proj_sch_deg_kwd"])): | |||||
| if cv.get("highest_degree_kwd"): cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"] | |||||
| try: | |||||
| if not cv.get("work_exp_flt") and cv.get("work_start_time"): | |||||
| if re.match(r"[0-9]{9,}", str(cv["work_start_time"])): | |||||
| cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"]) | |||||
| cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365. | |||||
| elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])): | |||||
| y, m, d = getYMD(str(cv["work_start_time"])) | |||||
| cv["work_start_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d)) | |||||
| cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y) | |||||
| except Exception as e: | |||||
| print("【EXCEPTION】", e, "==>", cv.get("work_start_time")) | |||||
| if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12. | |||||
| keys = list(cv.keys()) | |||||
| for k in keys: | |||||
| if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k): del cv[k] | |||||
| for k in cv.keys(): | |||||
| if not re.search("_(kwd|id)$", k) or type(cv[k]) != type([]): continue | |||||
| cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']])) | |||||
| keys = [k for k in cv.keys() if re.search(r"_feas*$", k)] | |||||
| for k in keys: | |||||
| if cv[k] <= 0: del cv[k] | |||||
| cv["tob_resume_id"] = str(cv["tob_resume_id"]) | |||||
| cv["id"] = cv["tob_resume_id"] | |||||
| print("CCCCCCCCCCCCCCC") | |||||
| return dealWithInt64(cv) | |||||
def dealWithInt64(d):
    """Recursively replace numpy integer scalars in ``d`` with built-in ``int``.

    Args:
        d: Any value. Dicts and lists are walked recursively; every other
            type is returned untouched.

    Returns:
        * for a numpy integer scalar: the equivalent ``int``;
        * for a list: a NEW list holding the converted items;
        * for a dict: the SAME dict object, with its values converted in place;
        * anything else: ``d`` itself.
    """
    if isinstance(d, np.integer):
        return int(d)
    if isinstance(d, list):
        return [dealWithInt64(item) for item in d]
    if isinstance(d, dict):
        # Only existing keys are re-assigned, so the dict never changes size
        # while it is being iterated.
        for key in d:
            d[key] = dealWithInt64(d[key])
    return d
| if not os.path.exists(model_file_path): | if not os.path.exists(model_file_path): | ||||
| raise ValueError("not find model file path {}".format( | raise ValueError("not find model file path {}".format( | ||||
| model_file_path)) | model_file_path)) | ||||
| sess = ort.InferenceSession(model_file_path) | |||||
| if ort.get_device() == "GPU": | |||||
| sess = ort.InferenceSession(model_file_path, providers=['CUDAExecutionProvider']) | |||||
| else: | |||||
| sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider']) | |||||
| return sess, sess.get_inputs()[0] | return sess, sess.get_inputs()[0] | ||||
| # | # | ||||
| import copy | import copy | ||||
| import re | import re | ||||
| from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, \ | |||||
| from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \ | |||||
| hierarchical_merge, make_colon_as_title, naive_merge, random_choices | hierarchical_merge, make_colon_as_title, naive_merge, random_choices | ||||
| from rag.nlp import huqie | from rag.nlp import huqie | ||||
| from deepdoc.parser import PdfParser, DocxParser | from deepdoc.parser import PdfParser, DocxParser | ||||
| return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls | return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls | ||||
| def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): | |||||
| def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): | |||||
| """ | """ | ||||
| Supported file formats are docx, pdf, txt. | Supported file formats are docx, pdf, txt. | ||||
| Since a book is long and not all the parts are useful, if it's a PDF, | Since a book is long and not all the parts are useful, if it's a PDF, | ||||
| sections = [t for t, _ in sections] | sections = [t for t, _ in sections] | ||||
| # is it English | # is it English | ||||
| eng = is_english(random_choices(sections, k=218)) | |||||
| eng = lang.lower() == "english"#is_english(random_choices(sections, k=218)) | |||||
| res = [] | res = [] | ||||
| # add tables | # add tables |
| import re | import re | ||||
| from io import BytesIO | from io import BytesIO | ||||
| from docx import Document | from docx import Document | ||||
| from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \ | |||||
| from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \ | |||||
| make_colon_as_title | make_colon_as_title | ||||
| from rag.nlp import huqie | from rag.nlp import huqie | ||||
| from deepdoc.parser import PdfParser, DocxParser | from deepdoc.parser import PdfParser, DocxParser | ||||
| return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes] | return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes] | ||||
| def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): | |||||
| def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): | |||||
| """ | """ | ||||
| Supported file formats are docx, pdf, txt. | Supported file formats are docx, pdf, txt. | ||||
| """ | """ | ||||
| else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)") | else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)") | ||||
| # is it English | # is it English | ||||
| eng = is_english(sections) | |||||
| eng = lang.lower() == "english"#is_english(sections) | |||||
| # Remove 'Contents' part | # Remove 'Contents' part | ||||
| remove_contents_table(sections, eng) | remove_contents_table(sections, eng) | ||||
| import copy | import copy | ||||
| import re | import re | ||||
| from deepdoc.parser import tokenize | |||||
| from rag.nlp import huqie | |||||
| from rag.nlp import huqie, tokenize | |||||
| from deepdoc.parser import PdfParser | from deepdoc.parser import PdfParser | ||||
| from rag.utils import num_tokens_from_string | from rag.utils import num_tokens_from_string | ||||
| return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls | return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls | ||||
| def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): | |||||
| def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): | |||||
| """ | """ | ||||
| Only pdf is supported. | Only pdf is supported. | ||||
| """ | """ | ||||
| doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"])) | doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"])) | ||||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | ||||
| # is it English | # is it English | ||||
| eng = pdf_parser.is_english | |||||
| eng = lang.lower() == "english"#pdf_parser.is_english | |||||
| res = [] | res = [] | ||||
| # add tables | # add tables |
| import copy | import copy | ||||
| import re | import re | ||||
| from rag.app import laws | from rag.app import laws | ||||
| from deepdoc.parser import is_english, tokenize, naive_merge | |||||
| from rag.nlp import huqie | |||||
| from rag.nlp import huqie, is_english, tokenize, naive_merge | |||||
| from deepdoc.parser import PdfParser | from deepdoc.parser import PdfParser | ||||
| from rag.settings import cron_logger | from rag.settings import cron_logger | ||||
| return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes] | return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes] | ||||
| def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): | |||||
| def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): | |||||
| """ | """ | ||||
| Supported file formats are docx, pdf, txt. | Supported file formats are docx, pdf, txt. | ||||
| This method apply the naive ways to chunk files. | This method apply the naive ways to chunk files. | ||||
| parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"}) | parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"}) | ||||
| cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"]) | cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"]) | ||||
| eng = is_english(cks) | |||||
| eng = lang.lower() == "english"#is_english(cks) | |||||
| res = [] | res = [] | ||||
| # wrap up to es documents | # wrap up to es documents | ||||
| for ck in cks: | for ck in cks: |
| from collections import Counter | from collections import Counter | ||||
| from api.db import ParserType | from api.db import ParserType | ||||
| from deepdoc.parser import tokenize | |||||
| from rag.nlp import huqie | |||||
| from rag.nlp import huqie, tokenize | |||||
| from deepdoc.parser import PdfParser | from deepdoc.parser import PdfParser | ||||
| import numpy as np | import numpy as np | ||||
| from rag.utils import num_tokens_from_string | from rag.utils import num_tokens_from_string | ||||
| } | } | ||||
| def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): | |||||
| def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): | |||||
| """ | """ | ||||
| Only pdf is supported. | Only pdf is supported. | ||||
| The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly. | The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly. | ||||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | ||||
| doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"]) | doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"]) | ||||
| # is it English | # is it English | ||||
| eng = pdf_parser.is_english | |||||
| eng = lang.lower() == "english"#pdf_parser.is_english | |||||
| print("It's English.....", eng) | print("It's English.....", eng) | ||||
| res = [] | res = [] |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # | |||||
| import io | |||||
| import numpy as np | |||||
| from PIL import Image | |||||
| from api.db import LLMType | |||||
| from api.db.services.llm_service import LLMBundle | |||||
| from rag.nlp import tokenize | |||||
| from deepdoc.vision import OCR | |||||
| ocr = OCR() | |||||
def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    """Turn a single picture into one indexable document.

    The image is OCR'ed first. If the OCR text is long enough it is used on
    its own; otherwise an image-to-text model is asked to describe the
    picture and its answer is appended to the OCR text.

    Args:
        filename: Name stored in the document's ``docnm_kwd`` field.
        binary: Raw bytes of the image file.
        tenant_id: Tenant whose IMAGE2TEXT model is looked up via ``LLMBundle``.
        lang: Language name; ``"english"`` (case-insensitive) selects the
            English tokenisation path and the word-based length check.
        callback: Optional progress reporter, called either as
            ``callback(progress, message)`` or ``callback(prog=-1, msg=...)``
            on failure. May be ``None``, in which case progress is not reported.
        **kwargs: Accepted for signature compatibility; unused here.

    Returns:
        ``[doc]`` on success, or ``[]`` when the model cannot be created or
        the description step raises.
    """
    def notify(*args, **kw):
        # ``callback`` defaults to None; calling it unguarded would replace
        # the real result (or the real error) with a TypeError.
        if callback is not None:
            callback(*args, **kw)

    try:
        cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
    except Exception as e:
        notify(prog=-1, msg=str(e))
        return []

    img = Image.open(io.BytesIO(binary))
    doc = {
        "docnm_kwd": filename,
        "image": img
    }
    bxs = ocr(np.array(img))
    # Each OCR box is unpacked as (_, t) and t[0] is kept when truthy.
    txt = "\n".join([t[0] for _, t in bxs if t[0]])
    eng = lang.lower() == "english"
    notify(0.4, "Finish OCR: (%s ...)" % txt[:12])

    # NOTE(review): more than 32 space-separated words implies more than 32
    # characters, so the English clause is redundant and the check reduces to
    # ``len(txt) > 32``. A word-based limit for English was probably
    # intended -- confirm before changing.
    if (eng and len(txt.split(" ")) > 32) or len(txt) > 32:
        tokenize(doc, txt, eng)
        notify(0.8, "OCR results is too long to use CV LLM.")
        return [doc]

    try:
        notify(0.4, "Use CV LLM to describe the picture.")
        # Presumably returns a str: it is sliced and concatenated below.
        ans = cv_mdl.describe(binary)
        notify(0.8, "CV LLM respoond: %s ..." % ans[:32])
        txt += "\n" + ans
        tokenize(doc, txt, eng)
        return [doc]
    except Exception as e:
        notify(prog=-1, msg=str(e))

    return []
| import copy | import copy | ||||
| import re | import re | ||||
| from io import BytesIO | from io import BytesIO | ||||
| from pptx import Presentation | |||||
| from deepdoc.parser import tokenize, is_english | |||||
| from rag.nlp import tokenize, is_english | |||||
| from rag.nlp import huqie | from rag.nlp import huqie | ||||
| from deepdoc.parser import PdfParser | |||||
| from deepdoc.parser import PdfParser, PptParser | |||||
| class Ppt(object): | |||||
| def __init__(self): | |||||
| super().__init__() | |||||
| def __extract(self, shape): | |||||
| if shape.shape_type == 19: | |||||
| tb = shape.table | |||||
| rows = [] | |||||
| for i in range(1, len(tb.rows)): | |||||
| rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)])) | |||||
| return "\n".join(rows) | |||||
| if shape.has_text_frame: | |||||
| return shape.text_frame.text | |||||
| if shape.shape_type == 6: | |||||
| texts = [] | |||||
| for p in shape.shapes: | |||||
| t = self.__extract(p) | |||||
| if t: texts.append(t) | |||||
| return "\n".join(texts) | |||||
| class Ppt(PptParser): | |||||
| def __call__(self, fnm, from_page, to_page, callback=None): | def __call__(self, fnm, from_page, to_page, callback=None): | ||||
| ppt = Presentation(fnm) if isinstance( | |||||
| fnm, str) else Presentation( | |||||
| BytesIO(fnm)) | |||||
| txts = [] | |||||
| self.total_page = len(ppt.slides) | |||||
| for i, slide in enumerate(ppt.slides[from_page: to_page]): | |||||
| texts = [] | |||||
| for shape in slide.shapes: | |||||
| txt = self.__extract(shape) | |||||
| if txt: texts.append(txt) | |||||
| txts.append("\n".join(texts)) | |||||
| txts = super.__call__(fnm, from_page, to_page) | |||||
| callback(0.5, "Text extraction finished.") | callback(0.5, "Text extraction finished.") | ||||
| import aspose.slides as slides | import aspose.slides as slides |
| from io import BytesIO | from io import BytesIO | ||||
| from nltk import word_tokenize | from nltk import word_tokenize | ||||
| from openpyxl import load_workbook | from openpyxl import load_workbook | ||||
| from deepdoc.parser import is_english, random_choices | |||||
| from rag.nlp import is_english, random_choices | |||||
| from rag.nlp import huqie, stemmer | from rag.nlp import huqie, stemmer | ||||
| from deepdoc.parser import ExcelParser | from deepdoc.parser import ExcelParser | ||||
| return d | return d | ||||
| def chunk(filename, binary=None, callback=None, **kwargs): | |||||
| def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): | |||||
| """ | """ | ||||
| Excel and csv(txt) format files are supported. | Excel and csv(txt) format files are supported. | ||||
| If the file is in excel format, there should be 2 column question and answer without header. | If the file is in excel format, there should be 2 column question and answer without header. | ||||
| break | break | ||||
| txt += l | txt += l | ||||
| lines = txt.split("\n") | lines = txt.split("\n") | ||||
| eng = is_english([rmPrefix(l) for l in lines[:100]]) | |||||
| eng = lang.lower() == "english"#is_english([rmPrefix(l) for l in lines[:100]]) | |||||
| fails = [] | fails = [] | ||||
| for i, line in enumerate(lines): | for i, line in enumerate(lines): | ||||
| arr = [l for l in line.split("\t") if len(l) > 1] | arr = [l for l in line.split("\t") if len(l) > 1] |
| from dateutil.parser import parse as datetime_parse | from dateutil.parser import parse as datetime_parse | ||||
| from api.db.services.knowledgebase_service import KnowledgebaseService | from api.db.services.knowledgebase_service import KnowledgebaseService | ||||
| from deepdoc.parser import is_english, tokenize | |||||
| from rag.nlp import huqie | |||||
| from rag.nlp import huqie, is_english, tokenize | |||||
| from deepdoc.parser import ExcelParser | from deepdoc.parser import ExcelParser | ||||
| return arr, ty | return arr, ty | ||||
| def chunk(filename, binary=None, callback=None, **kwargs): | |||||
| def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): | |||||
| """ | """ | ||||
| Excel and csv(txt) format files are supported. | Excel and csv(txt) format files are supported. | ||||
| For csv or txt file, the delimiter between columns is TAB. | For csv or txt file, the delimiter between columns is TAB. | ||||
| clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j]) | clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j]) | ||||
| for i in range(len(clmns))] | for i in range(len(clmns))] | ||||
| eng = is_english(txts) | |||||
| eng = lang.lower() == "english"#is_english(txts) | |||||
| for ii, row in df.iterrows(): | for ii, row in df.iterrows(): | ||||
| d = {} | d = {} | ||||
| row_txt = [] | row_txt = [] |
| # See the License for the specific language governing permissions and | # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| import io | |||||
| from abc import ABC | from abc import ABC | ||||
| from PIL import Image | |||||
| from openai import OpenAI | from openai import OpenAI | ||||
| import os | import os | ||||
| import base64 | import base64 | ||||
| from io import BytesIO | from io import BytesIO | ||||
| from api.utils import get_uuid | |||||
| from api.utils.file_utils import get_project_base_directory | |||||
| class Base(ABC): | class Base(ABC): | ||||
| def __init__(self, key, model_name): | def __init__(self, key, model_name): | ||||
| { | { | ||||
| "role": "user", | "role": "user", | ||||
| "content": [ | "content": [ | ||||
| { | |||||
| "type": "text", | |||||
| "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等。", | |||||
| }, | |||||
| { | { | ||||
| "type": "image_url", | "type": "image_url", | ||||
| "image_url": { | "image_url": { | ||||
| "url": f"data:image/jpeg;base64,{b64}" | "url": f"data:image/jpeg;base64,{b64}" | ||||
| }, | }, | ||||
| }, | }, | ||||
| { | |||||
| "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else \ | |||||
| "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.", | |||||
| }, | |||||
| ], | ], | ||||
| } | } | ||||
| ] | ] | ||||
| class GptV4(Base): | class GptV4(Base): | ||||
| def __init__(self, key, model_name="gpt-4-vision-preview"): | |||||
| def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese"): | |||||
| self.client = OpenAI(api_key=key) | self.client = OpenAI(api_key=key) | ||||
| self.model_name = model_name | self.model_name = model_name | ||||
| self.lang = lang | |||||
| def describe(self, image, max_tokens=300): | def describe(self, image, max_tokens=300): | ||||
| b64 = self.image2base64(image) | b64 = self.image2base64(image) | ||||
| class QWenCV(Base): | class QWenCV(Base): | ||||
| def __init__(self, key, model_name="qwen-vl-chat-v1"): | |||||
| def __init__(self, key, model_name="qwen-vl-chat-v1", lang="Chinese"): | |||||
| import dashscope | import dashscope | ||||
| dashscope.api_key = key | dashscope.api_key = key | ||||
| self.model_name = model_name | self.model_name = model_name | ||||
| self.lang = lang | |||||
def prompt(self, binary):
    """Build the chat message list for describing an image.

    The image bytes are written to a uniquely named ``.jpg`` file under the
    project's ``tmp`` directory and referenced through a ``file://`` URL in
    the returned message.

    Args:
        binary: Raw bytes of the image.

    Returns:
        A one-element list holding a ``user`` message whose content is the
        image reference followed by the description request, in Chinese when
        ``self.lang`` is "chinese" (case-insensitive) and in English otherwise.
    """
    # NOTE(review): the temporary file is never removed in this method;
    # cleanup, if any, must happen elsewhere.
    tmp_dir = get_project_base_directory("tmp")
    # exist_ok avoids the check-then-create race between workers and also
    # creates missing parent directories.
    os.makedirs(tmp_dir, exist_ok=True)
    path = os.path.join(tmp_dir, "%s.jpg" % get_uuid())

    img = Image.open(io.BytesIO(binary))
    # The JPEG writer rejects modes such as RGBA or P (typical for PNG/GIF
    # uploads), so convert anything it cannot store.
    if img.mode not in ("L", "RGB", "CMYK"):
        img = img.convert("RGB")
    img.save(path)

    return [
        {
            "role": "user",
            "content": [
                {
                    "image": f"file://{path}"
                },
                {
                    "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else \
                        "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
                },
            ],
        }
    ]
| def describe(self, image, max_tokens=300): | def describe(self, image, max_tokens=300): | ||||
| from http import HTTPStatus | from http import HTTPStatus | ||||
| from dashscope import MultiModalConversation | from dashscope import MultiModalConversation | ||||
| response = MultiModalConversation.call(model=self.model_name, | response = MultiModalConversation.call(model=self.model_name, | ||||
| messages=self.prompt(self.image2base64(image))) | |||||
| messages=self.prompt(image)) | |||||
| if response.status_code == HTTPStatus.OK: | if response.status_code == HTTPStatus.OK: | ||||
| return response.output.choices[0]['message']['content'], response.usage.output_tokens | |||||
| return response.output.choices[0]['message']['content'][0]["text"], response.usage.output_tokens | |||||
| return response.message, 0 | return response.message, 0 | ||||
| class Zhipu4V(Base): | class Zhipu4V(Base): | ||||
| def __init__(self, key, model_name="glm-4v"): | |||||
| def __init__(self, key, model_name="glm-4v", lang="Chinese"): | |||||
| self.client = ZhipuAI(api_key=key) | self.client = ZhipuAI(api_key=key) | ||||
| self.model_name = model_name | self.model_name = model_name | ||||
| self.lang = lang | |||||
| def describe(self, image, max_tokens=1024): | def describe(self, image, max_tokens=1024): | ||||
| b64 = self.image2base64(image) | b64 = self.image2base64(image) |
| from nltk.stem import PorterStemmer | from nltk.stem import PorterStemmer | ||||
| stemmer = PorterStemmer() | stemmer = PorterStemmer() | ||||
| import re | |||||
| from nltk import word_tokenize | |||||
| from . import huqie | |||||
| from rag.utils import num_tokens_from_string | |||||
| import random | |||||
# Families of regular expressions that recognise heading/bullet prefixes.
# Each inner list is one family. bullets_category() counts, per family, how
# many sections match any of its patterns and returns the index of the family
# with the most hits; hierarchical_merge() uses a pattern's position inside
# the selected family as that section's level.
BULLET_PATTERN = [[
    r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
    r"第[零一二三四五六七八九十百0-9]+章",
    r"第[零一二三四五六七八九十百0-9]+节",
    r"第[零一二三四五六七八九十百0-9]+条",
    r"[\((][零一二三四五六七八九十百]+[\))]",
], [
    r"第[0-9]+章",
    r"第[0-9]+节",
    r"[0-9]{,3}[\. 、]",
    r"[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
    r"第[零一二三四五六七八九十百0-9]+章",
    r"第[零一二三四五六七八九十百0-9]+节",
    r"[零一二三四五六七八九十百]+[ 、]",
    r"[\((][零一二三四五六七八九十百]+[\))]",
    r"[\((][0-9]{,2}[\))]",
], [
    r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
    r"Chapter (I+V?|VI*|XI|IX|X)",
    r"Section [0-9]+",
    r"Article [0-9]+"
]
]
def random_choices(arr, k):
    """Sample from ``arr`` with replacement, never asking for more than it holds.

    Args:
        arr: Sequence to sample from.
        k: Requested number of samples; capped at ``len(arr)``.

    Returns:
        A list of ``min(len(arr), k)`` elements drawn by ``random.choices``.
    """
    sample_size = k if k < len(arr) else len(arr)
    return random.choices(arr, k=sample_size)
def bullets_category(sections):
    """Pick the bullet family in ``BULLET_PATTERN`` that best fits ``sections``.

    A section counts as one hit for a family when it matches (``re.match``)
    at least one of that family's patterns.

    Args:
        sections: Iterable of section strings.

    Returns:
        Index of the family with the most hits (the first one on ties), or
        ``-1`` when no section matches any family.
    """
    hit_counts = [
        sum(1 for sec in sections if any(re.match(pat, sec) for pat in family))
        for family in BULLET_PATTERN
    ]

    best_idx, best_hits = -1, 0
    for idx, hits in enumerate(hit_counts):
        # Strictly greater: a later family never displaces an equal earlier one.
        if hits > best_hits:
            best_idx, best_hits = idx, hits
    return best_idx
def is_english(texts):
    """Guess whether a collection of text snippets is predominantly English.

    A snippet counts as English when, after stripping surrounding
    whitespace, it starts with at least two ASCII letters.

    Args:
        texts: Sequence of strings.

    Returns:
        ``True`` when more than 80% of the snippets count as English,
        ``False`` otherwise -- including for an empty sequence, which
        previously raised ``ZeroDivisionError``.
    """
    if not texts:
        return False
    eng = sum(1 for t in texts if re.match(r"[a-zA-Z]{2,}", t.strip()))
    return eng / len(texts) > 0.8
def tokenize(d, t, eng):
    """Store text ``t`` and its tokenised forms on document dict ``d``.

    Always writes ``d["content_with_weight"]`` (the raw text) and
    ``d["content_ltks"]`` (the token string).

    Args:
        d: Document dict, updated in place.
        t: Text to tokenise.
        eng: When truthy, use NLTK word tokenisation plus stemming;
            otherwise use ``huqie``.
    """
    d["content_with_weight"] = t
    if not eng:
        coarse = huqie.qie(t)
        d["content_ltks"] = coarse
        # NOTE(review): as reconstructed from the flattened source, the
        # fine-grained field is only written on this non-English path --
        # confirm against the real file's indentation.
        d["content_sm_ltks"] = huqie.qieqie(coarse)
        return

    # Join words split by a hyphen between lowercase letters before stemming.
    dehyphenated = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
    stems = [stemmer.stem(w) for w in word_tokenize(dehyphenated)]
    d["content_ltks"] = " ".join(stems)
def remove_contents_table(sections, eng=False):
    """Strip table-of-contents blocks out of ``sections`` in place.

    When a section consisting only of a contents-style heading is found, the
    heading and the first following non-blank entry are removed. The leading
    part of that entry (first two space-separated words for English, first
    three characters otherwise) is then searched for within the next 128
    sections. Everything before the first section starting with it is
    dropped, on the assumption that this is where the body repeats the first
    contents entry.

    Args:
        sections: List of strings, or of sequences whose first item is the
            text. Modified in place.
        eng: Whether the document is English (changes how the prefix is cut).

    Returns:
        None.
    """
    def get(idx):
        item = sections[idx]
        return (item if isinstance(item, str) else item[0]).strip()

    def lead(idx):
        text = get(idx)
        return " ".join(text.split(" ")[:2]) if eng else text[:3]

    i = 0
    while i < len(sections):
        # Whitespace is removed before matching, hence "tableofcontents".
        # The flag belongs on the match: the original passed re.IGNORECASE
        # as re.sub's positional ``count``, so case was never ignored.
        heading = re.sub(r"\s+", "", get(i).split("@@")[0])
        if not re.match(r"(contents|目录|目次|tableofcontents|致谢|acknowledge)$",
                        heading, flags=re.IGNORECASE):
            i += 1
            continue

        sections.pop(i)  # the heading itself
        if i >= len(sections):
            break

        # Skip blank entries directly after the heading.
        prefix = lead(i)
        while not prefix:
            sections.pop(i)
            if i >= len(sections):
                break
            prefix = lead(i)
        # Bail out BEFORE popping again: if the loop above emptied the list,
        # another pop would raise IndexError.
        if i >= len(sections) or not prefix:
            break

        sections.pop(i)  # first contents entry, the source of ``prefix``
        if i >= len(sections):
            break

        for j in range(i, min(i + 128, len(sections))):
            # Literal comparison: the prefix is document text such as "1) "
            # or "(一)", which is invalid or misleading when used as a regex.
            if not get(j).startswith(prefix):
                continue
            del sections[i:j]
            break
def make_colon_as_title(sections):
    """Scan ``(text, layout)`` sections for lines ending in a colon.

    Args:
        sections: List of strings, or list of ``(text, layout)`` pairs.

    Returns:
        ``[]`` for empty input, ``sections`` itself when it holds plain
        strings, and ``None`` (implicitly) after scanning a list of pairs.
    """
    if not sections:
        return []
    if type(sections[0]) == type(""):
        return sections

    idx = 0
    while idx < len(sections):
        text, _layout = sections[idx]
        idx += 1
        # Only the part before the first "@" is inspected.
        text = text.split("@")[0].strip()
        if not text or text[-1] not in "::":
            continue
        # Split the reversed text so parts[0] is the trailing clause.
        parts = re.split(r"([。?!!?;;]| .)", text[::-1])
        # NOTE(review): parts[1] is the captured delimiter (1-2 characters),
        # so ``len(parts[1]) >= 32`` can never hold and the insert below is
        # unreachable as written. Presumably another element was meant --
        # confirm intent before changing.
        if len(parts) >= 2 and len(parts[1]) >= 32:
            sections.insert(idx - 1, (parts[0][::-1], "title"))
            idx += 1
def hierarchical_merge(bull, sections, depth):
    """Group sections into chunks that follow the bullet hierarchy.

    Each kept section is assigned a level: its index among the patterns of
    ``BULLET_PATTERN[bull]`` when it matches one and looks like a title,
    otherwise one of two fallback levels depending on whether its layout tag
    mentions "title"/"head". The level lists are then reversed and the
    sections of the first ``depth`` reversed levels each seed a chunk, which
    is extended with one preceding section taken from each later level.

    Args:
        bull: Index into ``BULLET_PATTERN`` (as returned by
            ``bullets_category``); a negative value yields ``[]``.
        sections: List of strings or of ``(text, layout)`` pairs.
        depth: How many of the reversed levels are used to seed chunks.

    Returns:
        A list of chunks, each a list of section texts.
    """
    if not sections or bull < 0:
        return []
    if type(sections[0]) == type(""):
        sections = [(s, "") for s in sections]

    def head_of(text):
        # Text before the first "@", surrounding whitespace removed.
        return text.split("@")[0].strip()

    # Drop empty/one-character sections and bare numbers.
    sections = [
        (text, layout) for text, layout in sections
        if text and len(head_of(text)) > 1 and not re.match(r"[0-9]+$", head_of(text))
    ]
    patterns = BULLET_PATTERN[bull]
    bullets_size = len(patterns)
    levels = [[] for _ in range(bullets_size + 2)]

    def not_title(txt):
        if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
            return False
        if len(txt) >= 128:
            return True
        return re.search(r"[,;,。;!!]", txt)

    for idx, (text, layout) in enumerate(sections):
        level = None
        for lvl, pat in enumerate(patterns):
            if re.match(pat, text.strip()) and not not_title(text):
                level = lvl
                break
        if level is None:
            # No bullet pattern applied: fall back on the layout tag.
            if re.search(r"(title|head)", layout):
                level = bullets_size
            else:
                level = bullets_size + 1
        levels[level].append(idx)

    sections = [text for text, _ in sections]
    for text in sections:
        print("--", text)

    def binary_search(arr, target):
        # Index of the right-most element below ``target``, or -1 if none.
        # Assumes ``arr`` is ascending and does not contain ``target``: an
        # equality hit inside the loop trips the assert.
        if not arr:
            return -1
        if target > arr[-1]:
            return len(arr) - 1
        if target < arr[0]:
            return -1
        lo, hi = 0, len(arr)
        while hi - lo > 1:
            mid = (hi + lo) // 2
            if target > arr[mid]:
                lo = mid
            elif target < arr[mid]:
                hi = mid
            else:
                assert False
        return lo

    chunk_ids = []
    readed = [False] * len(sections)
    levels = levels[::-1]
    for lvl_no, members in enumerate(levels[:depth]):
        for sec_idx in members:
            if readed[sec_idx]:
                continue
            readed[sec_idx] = True
            chunk_ids.append([sec_idx])
            if lvl_no + 1 == len(levels) - 1:
                continue
            for other in range(lvl_no + 1, len(levels)):
                pos = binary_search(levels[other], sec_idx)
                if pos < 0:
                    continue
                if pos > chunk_ids[-1][-1]:
                    chunk_ids[-1].pop(-1)
                chunk_ids[-1].append(levels[other][pos])
            for used in chunk_ids[-1]:
                readed[used] = True

    merged = []
    for ids in chunk_ids:
        # Indices were collected seed-first; emit them in reverse.
        texts = [sections[k] for k in ids[::-1]]
        merged.append(texts)
        print("--------------\n", "\n* ".join(texts))
    return merged
def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
    """Cut sections at delimiter characters and pack the pieces into chunks.

    Pieces are appended to the current chunk; a new chunk is started only
    once the current chunk's token count already exceeds ``chunk_token_num``.

    Args:
        sections: List of strings or of ``(text, position_tag)`` pairs.
        chunk_token_num: Token count above which the running chunk is closed.
        delimiter: String of characters at which a section may be cut.

    Returns:
        List of chunk strings (``[]`` for empty input).
    """
    if not sections:
        return []
    if type(sections[0]) == type(""):
        sections = [(text, "") for text in sections]

    chunks = [""]
    token_counts = [0]

    def append_piece(piece, tag):
        n_tokens = num_tokens_from_string(piece)
        # Pieces shorter than 8 tokens do not carry their position tag.
        if n_tokens < 8:
            tag = ""
        if token_counts[-1] > chunk_token_num:
            chunks.append(piece + tag)
            token_counts.append(n_tokens)
        else:
            chunks[-1] += piece + tag
            token_counts[-1] += n_tokens

    for sec, pos in sections:
        start, end = 0, 1
        # The first character of each piece is never tested as a delimiter.
        while end < len(sec):
            if sec[end] not in delimiter:
                end += 1
                continue
            append_piece(sec[start: end + 1], pos)
            start = end + 1
            end = start + 1
        if start < end:
            append_piece(sec[start: end], pos)

    return chunks
| import copy | import copy | ||||
| import re | import re | ||||
| import sys | import sys | ||||
| import traceback | |||||
| from functools import partial | from functools import partial | ||||
| from timeit import default_timer as timer | from timeit import default_timer as timer | ||||
| from io import BytesIO | from io import BytesIO | ||||
| import pandas as pd | import pandas as pd | ||||
| from rag.app import laws, paper, presentation, manual, qa, table, book, resume | |||||
| from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture | |||||
| from api.db import LLMType, ParserType | from api.db import LLMType, ParserType | ||||
| from api.db.services.document_service import DocumentService | from api.db.services.document_service import DocumentService | ||||
| ParserType.QA.value: qa, | ParserType.QA.value: qa, | ||||
| ParserType.TABLE.value: table, | ParserType.TABLE.value: table, | ||||
| ParserType.RESUME.value: resume, | ParserType.RESUME.value: resume, | ||||
| ParserType.PICTURE.value: picture, | |||||
| } | } | ||||
| def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."): | |||||
| def set_progress(task_id, from_page=0, to_page=-1, | |||||
| prog=None, msg="Processing..."): | |||||
| if prog is not None and prog < 0: | |||||
| msg = "[ERROR]"+msg | |||||
| cancel = TaskService.do_cancel(task_id) | cancel = TaskService.do_cancel(task_id) | ||||
| if cancel: | if cancel: | ||||
| msg += " [Canceled]" | msg += " [Canceled]" | ||||
| prog = -1 | prog = -1 | ||||
| if to_page > 0: msg = f"Page({from_page}~{to_page}): " + msg | |||||
| if to_page > 0: | |||||
| msg = f"Page({from_page}~{to_page}): " + msg | |||||
| d = {"progress_msg": msg} | d = {"progress_msg": msg} | ||||
| if prog is not None: d["progress"] = prog | |||||
| if prog is not None: | |||||
| d["progress"] = prog | |||||
| try: | try: | ||||
| TaskService.update_progress(task_id, d) | TaskService.update_progress(task_id, d) | ||||
| except Exception as e: | except Exception as e: | ||||
| cron_logger.error("set_progress:({}), {}".format(task_id, str(e))) | cron_logger.error("set_progress:({}), {}".format(task_id, str(e))) | ||||
| if cancel:sys.exit() | |||||
| """ | |||||
| def chuck_doc(name, binary, tenant_id, cvmdl=None): | |||||
| suff = os.path.split(name)[-1].lower().split(".")[-1] | |||||
| if suff.find("pdf") >= 0: | |||||
| return PDF(binary) | |||||
| if suff.find("doc") >= 0: | |||||
| return DOC(binary) | |||||
| if re.match(r"(xlsx|xlsm|xltx|xltm)", suff): | |||||
| return EXC(binary) | |||||
| if suff.find("ppt") >= 0: | |||||
| return PPT(binary) | |||||
| if cvmdl and re.search(r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$", | |||||
| name.lower()): | |||||
| txt = cvmdl.describe(binary) | |||||
| field = TextChunker.Fields() | |||||
| field.text_chunks = [(txt, binary)] | |||||
| field.table_chunks = [] | |||||
| return field | |||||
| return TextChunker()(binary) | |||||
| """ | |||||
| if cancel: | |||||
| sys.exit() | |||||
| def collect(comm, mod, tm): | def collect(comm, mod, tm): | ||||
| return tasks | return tasks | ||||
| def build(row, cvmdl): | |||||
| def build(row): | |||||
| if row["size"] > DOC_MAXIMUM_SIZE: | if row["size"] > DOC_MAXIMUM_SIZE: | ||||
| set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" % | set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" % | ||||
| (int(DOC_MAXIMUM_SIZE / 1024 / 1024))) | (int(DOC_MAXIMUM_SIZE / 1024 / 1024))) | ||||
| return [] | return [] | ||||
| callback = partial(set_progress, row["id"], row["from_page"], row["to_page"]) | |||||
| callback = partial( | |||||
| set_progress, | |||||
| row["id"], | |||||
| row["from_page"], | |||||
| row["to_page"]) | |||||
| chunker = FACTORY[row["parser_id"].lower()] | chunker = FACTORY[row["parser_id"].lower()] | ||||
| try: | try: | ||||
| cron_logger.info("Chunkking {}/{}".format(row["location"], row["name"])) | |||||
| cks = chunker.chunk(row["name"], binary = MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"], to_page=row["to_page"], | |||||
| callback = callback, kb_id=row["kb_id"], parser_config=row["parser_config"]) | |||||
| cron_logger.info( | |||||
| "Chunkking {}/{}".format(row["location"], row["name"])) | |||||
| cks = chunker.chunk(row["name"], binary=MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"], | |||||
| to_page=row["to_page"], lang=row["language"], callback=callback, | |||||
| kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"]) | |||||
| except Exception as e: | except Exception as e: | ||||
| if re.search("(No such file|not found)", str(e)): | if re.search("(No such file|not found)", str(e)): | ||||
| callback(-1, "Can not find file <%s>" % row["doc_name"]) | callback(-1, "Can not find file <%s>" % row["doc_name"]) | ||||
| else: | else: | ||||
| callback(-1, f"Internal server error: %s" % str(e).replace("'", "")) | |||||
| callback(-1, f"Internal server error: %s" % | |||||
| str(e).replace("'", "")) | |||||
| traceback.print_exc() | |||||
| cron_logger.warn("Chunkking {}/{}: {}".format(row["location"], row["name"], str(e))) | |||||
| cron_logger.warn( | |||||
| "Chunkking {}/{}: {}".format(row["location"], row["name"], str(e))) | |||||
| return | return | ||||
| callback(msg="Finished slicing files. Start to embedding the content.") | |||||
| callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks)) | |||||
| docs = [] | docs = [] | ||||
| doc = { | doc = { | ||||
| d = copy.deepcopy(doc) | d = copy.deepcopy(doc) | ||||
| d.update(ck) | d.update(ck) | ||||
| md5 = hashlib.md5() | md5 = hashlib.md5() | ||||
| md5.update((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")) | |||||
| md5.update((ck["content_with_weight"] + | |||||
| str(d["doc_id"])).encode("utf-8")) | |||||
| d["_id"] = md5.hexdigest() | d["_id"] = md5.hexdigest() | ||||
| d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] | d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] | ||||
| d["create_timestamp_flt"] = datetime.datetime.now().timestamp() | d["create_timestamp_flt"] = datetime.datetime.now().timestamp() | ||||
| def embedding(docs, mdl, parser_config={}): | def embedding(docs, mdl, parser_config={}): | ||||
| tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [d["content_with_weight"] for d in docs] | |||||
| tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [ | |||||
| d["content_with_weight"] for d in docs] | |||||
| tk_count = 0 | tk_count = 0 | ||||
| if len(tts) == len(cnts): | if len(tts) == len(cnts): | ||||
| tts, c = mdl.encode(tts) | tts, c = mdl.encode(tts) | ||||
| cnts, c = mdl.encode(cnts) | cnts, c = mdl.encode(cnts) | ||||
| tk_count += c | tk_count += c | ||||
| title_w = float(parser_config.get("filename_embd_weight", 0.1)) | title_w = float(parser_config.get("filename_embd_weight", 0.1)) | ||||
| vects = (title_w * tts + (1-title_w) * cnts) if len(tts) == len(cnts) else cnts | |||||
| vects = (title_w * tts + (1 - title_w) * | |||||
| cnts) if len(tts) == len(cnts) else cnts | |||||
| assert len(vects) == len(docs) | assert len(vects) == len(docs) | ||||
| for i, d in enumerate(docs): | for i, d in enumerate(docs): | ||||
| def main(comm, mod): | def main(comm, mod): | ||||
| tm_fnm = os.path.join(get_project_base_directory(), "rag/res", f"{comm}-{mod}.tm") | |||||
| tm_fnm = os.path.join( | |||||
| get_project_base_directory(), | |||||
| "rag/res", | |||||
| f"{comm}-{mod}.tm") | |||||
| tm = findMaxTm(tm_fnm) | tm = findMaxTm(tm_fnm) | ||||
| rows = collect(comm, mod, tm) | rows = collect(comm, mod, tm) | ||||
| if len(rows) == 0: | if len(rows) == 0: | ||||
| callback = partial(set_progress, r["id"], r["from_page"], r["to_page"]) | callback = partial(set_progress, r["id"], r["from_page"], r["to_page"]) | ||||
| try: | try: | ||||
| embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING) | embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING) | ||||
| cv_mdl = LLMBundle(r["tenant_id"], LLMType.IMAGE2TEXT) | |||||
| # TODO: sequence2text model | |||||
| except Exception as e: | except Exception as e: | ||||
| callback(prog=-1, msg=str(e)) | callback(prog=-1, msg=str(e)) | ||||
| continue | continue | ||||
| st_tm = timer() | |||||
| cks = build(r, cv_mdl) | |||||
| if cks is None:continue | |||||
| cks = build(r) | |||||
| if cks is None: | |||||
| continue | |||||
| if not cks: | if not cks: | ||||
| tmf.write(str(r["update_time"]) + "\n") | tmf.write(str(r["update_time"]) + "\n") | ||||
| callback(1., "No chunk! Done!") | callback(1., "No chunk! Done!") | ||||
| cron_logger.error(str(es_r)) | cron_logger.error(str(es_r)) | ||||
| else: | else: | ||||
| if TaskService.do_cancel(r["id"]): | if TaskService.do_cancel(r["id"]): | ||||
| ELASTICSEARCH.deleteByQuery(Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"])) | |||||
| ELASTICSEARCH.deleteByQuery( | |||||
| Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"])) | |||||
| continue | continue | ||||
| callback(1., "Done!") | callback(1., "Done!") | ||||
| DocumentService.increment_chunk_num(r["doc_id"], r["kb_id"], tk_count, chunk_count, 0) | |||||
| cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks))) | |||||
| DocumentService.increment_chunk_num( | |||||
| r["doc_id"], r["kb_id"], tk_count, chunk_count, 0) | |||||
| cron_logger.info( | |||||
| "Chunk doc({}), token({}), chunks({})".format( | |||||
| r["id"], tk_count, len(cks))) | |||||
| tmf.write(str(r["update_time"]) + "\n") | tmf.write(str(r["update_time"]) + "\n") | ||||
| tmf.close() | tmf.close() |