大数据

提取pdf,excel,word表格中的公司名称

只支持 pdf、excel、word 中的原始表格数据,不支持解析图片中的数据

依赖安装:

pip install pdfplumber pandas openpyxl xlrd python-docx
import os
import re
import json
import sys
import pandas as pd
from docx import Document
import pdfplumber


# ======================
# 核心函数:判断是否为企业名称(带自动清洗)
# ======================
def is_company_name(text):
    """Return True when *text* looks like a company name.

    All whitespace (spaces, tabs, line breaks) is removed before checking, so
    a name that was wrapped across lines inside a table cell is still
    recognised.

    Args:
        text: Candidate value; anything that is not a ``str`` is rejected.

    Returns:
        bool: True if the whitespace-free text is 4 to 100 characters long
        and ends with one of the company suffixes in the pattern below.
    """
    if not isinstance(text, str):
        return False

    # Drop every run of whitespace so a wrapped name becomes a single token.
    compact = re.sub(r'\s+', '', text)
    if not 4 <= len(compact) <= 100:
        return False

    # The name must end with a company suffix; the last alternative accepts
    # any "...公司" whose preceding character is not an ASCII digit.
    suffix_pattern = r'.*(?:有限公司|有限责任公司|股份有限公司|集团.*公司|[^0-9]公司)$'
    return re.search(suffix_pattern, compact) is not None


# ======================
# PDF 提取(已修复换行问题)
# ======================
def extract_from_pdf(pdf_path):
    """Collect company names from the tables and text lines of a PDF.

    Each page is scanned twice: first every cell of every table that
    pdfplumber detects, then every line of the page's plain text as a
    supplement for tables that were not recognised. Names are stored with
    all whitespace removed and are de-duplicated.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list[str]: Unique company names in no particular order. If parsing
        fails, the error is printed to stderr and the names gathered up to
        that point are returned.
    """
    found = set()
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Pass 1: non-empty cells of every non-empty table on the page.
                cell_texts = (
                    str(cell)
                    for table in page.extract_tables()
                    if table
                    for row in table
                    for cell in row
                    if cell
                )
                found.update(
                    re.sub(r'\s+', '', cell_text)
                    for cell_text in cell_texts
                    if is_company_name(cell_text)
                )

                # Pass 2: plain-text lines, in case table detection missed rows.
                page_text = page.extract_text() or ""
                found.update(
                    re.sub(r'\s+', '', line)
                    for line in page_text.split('\n')
                    if is_company_name(line)
                )
    except Exception as e:
        print(f"⚠️ PDF 解析失败: {e}", file=sys.stderr)
    return list(found)


# ======================
# Excel 提取(同样可能有换行,如单元格内 Alt+Enter)
# ======================
def _collect_excel_names(excel_path, engine):
    """Read every sheet of a workbook with *engine* and return its company names.

    Args:
        excel_path: Path of the workbook to read.
        engine: Name of the pandas reader engine (``'openpyxl'`` or ``'xlrd'``).

    Returns:
        set[str]: Company names found in any cell, with all whitespace removed.

    Raises:
        Exception: Whatever ``pd.read_excel`` raises for this engine and file.
    """
    # sheet_name=None loads all sheets (the result is iterated via .values());
    # header=None keeps the first row as data; dtype=str reads cells as text.
    sheets = pd.read_excel(
        excel_path,
        sheet_name=None,
        header=None,
        dtype=str,
        engine=engine,
    )
    found = set()
    for frame in sheets.values():
        if frame.empty:
            continue
        for value in frame.values.flatten():
            # Empty cells come back as NaN and must be skipped before str().
            if pd.notna(value) and is_company_name(str(value)):
                found.add(re.sub(r'\s+', '', str(value)))
    return found


def extract_from_excel(excel_path):
    """Collect company names from every sheet of an Excel workbook.

    The workbook is read with the ``openpyxl`` engine first; if that fails for
    any reason, ``xlrd`` is tried as a fallback. Cell values may contain line
    breaks (Alt+Enter), so names are stored with all whitespace removed.

    Args:
        excel_path: Path of the Excel file to read.

    Returns:
        list[str]: Unique company names in no particular order, or an empty
        list when no engine could read the file. In that case the errors of
        all attempted engines are printed to stderr, so the original failure
        is not hidden behind the fallback's error.
    """
    failures = []
    for engine in ('openpyxl', 'xlrd'):
        try:
            return list(_collect_excel_names(excel_path, engine))
        except Exception as exc:
            # Remember why this engine failed and move on to the next one.
            failures.append(f"{engine}: {exc}")

    details = "; ".join(failures)
    print(f"⚠️ Excel 解析失败: {details}", file=sys.stderr)
    return []


# ======================
# Word 提取(表格或段落中可能有换行)
# ======================
def extract_from_docx(docx_path):
    """Collect company names from the table cells of a Word document.

    Only tables listed in ``doc.tables`` are scanned; body paragraphs outside
    tables are intentionally not examined. Names are stored with all
    whitespace removed and are de-duplicated.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        list[str]: Unique company names in no particular order. If parsing
        fails, the error is printed to stderr and the names gathered up to
        that point are returned.
    """
    found = set()
    try:
        document = Document(docx_path)
        # Flatten tables -> rows -> cells into one stream of cell texts.
        cell_texts = (
            cell.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
        )
        for raw in cell_texts:
            if raw and is_company_name(raw):
                found.add(re.sub(r'\s+', '', raw))
    except Exception as e:
        print(f"⚠️ Word 解析失败: {e}", file=sys.stderr)
    return list(found)


# ======================
# 主入口
# ======================
def extract_companies(file_path):
    """Dispatch *file_path* to the extractor that matches its extension.

    The extension is compared case-insensitively. Supported extensions are
    ``.pdf``, ``.xlsx``, ``.xls`` and ``.docx``.

    Args:
        file_path: Path of the document to process.

    Returns:
        list[str]: Company names returned by the matching extractor.

    Raises:
        FileNotFoundError: If *file_path* is not an existing regular file.
        ValueError: If the file extension is not supported.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"文件不存在: {file_path}")

    _, raw_suffix = os.path.splitext(file_path)
    suffix = raw_suffix.lower()

    if suffix == '.pdf':
        return extract_from_pdf(file_path)
    if suffix in ('.xlsx', '.xls'):
        return extract_from_excel(file_path)
    if suffix == '.docx':
        return extract_from_docx(file_path)

    raise ValueError(f"不支持的文件格式: {suffix}")


def main(file_path):
    """Extract company names from *file_path* and print them as JSON.

    On success, a JSON object of the form ``{"company_names": [...]}`` with
    the names sorted is written to stdout (non-ASCII characters are kept
    as-is, indented by two spaces). On any failure the error is written to
    stderr and the process exits with status 1.

    Args:
        file_path: Path of the document to process.
    """
    try:
        payload = {"company_names": sorted(extract_companies(file_path))}
        print(json.dumps(payload, ensure_ascii=False, indent=2))
    except Exception as e:
        print(f"❌ 错误: {e}", file=sys.stderr)
        raise SystemExit(1)


if __name__ == "__main__":
    # Take the input file from the first command-line argument when one is
    # given; otherwise fall back to the previously hard-coded default file.
    main(sys.argv[1] if len(sys.argv) > 1 else "parse02.docx")