文章
提取pdf,excel,word表格中的公司名称
只支持pdf、excel、word中原始表格数据,不支持图片中数据解析
依赖安装:
pip install pdfplumber pandas openpyxl python-docx
import os
import re
import json
import sys
import pandas as pd
from docx import Document
import pdfplumber
# ======================
# 核心函数:判断是否为企业名称(带自动清洗)
# ======================
def is_company_name(text):
    """Return True when *text* looks like a Chinese company name.

    All whitespace (spaces, tabs, line breaks) is removed before checking,
    so a name that was wrapped across lines is still recognised.  The
    whitespace-free text must be 4 to 100 characters long and must end with
    one of the company suffixes in the pattern below ("有限公司",
    "有限责任公司", "股份有限公司", "集团...公司", or "公司" preceded by a
    non-digit character).

    Anything that is not a ``str`` is rejected.
    """
    if isinstance(text, str):
        # Collapse every run of whitespace to nothing.
        compact = re.sub(r'\s+', '', text)
        if 4 <= len(compact) <= 100:
            # The text has to *end* with a company keyword.
            suffix_re = r'.*(?:有限公司|有限责任公司|股份有限公司|集团.*公司|[^0-9]公司)$'
            return re.search(suffix_re, compact) is not None
    return False
# ======================
# PDF 提取(已修复换行问题)
# ======================
def extract_from_pdf(pdf_path):
    """Collect company names from a PDF file.

    Each page is scanned in two passes: first cell by cell through the
    tables returned by ``page.extract_tables()``, then line by line through
    the page's plain text (a fallback for pages where table detection
    misses content).  Every candidate is validated with ``is_company_name``
    and stored with all whitespace removed.

    Parsing is best-effort: any exception is reported on stderr and the
    names gathered up to that point are still returned.

    Returns a list of unique names in no particular order.
    """
    found = set()

    def keep_if_company(raw):
        # Validate first, then store the whitespace-free form so the same
        # name wrapped differently collapses into a single entry.
        if is_company_name(raw):
            compact = re.sub(r'\s+', '', raw)
            if compact:
                found.add(compact)

    try:
        with pdfplumber.open(pdf_path) as document:
            for page in document.pages:
                # Pass 1: cells of the detected tables.
                for table in page.extract_tables():
                    for row in table or ():
                        for cell in row:
                            if cell:
                                keep_if_company(str(cell))
                # Pass 2: plain text of the page, one line at a time.
                page_text = page.extract_text() or ""
                for line in page_text.split('\n'):
                    keep_if_company(line)
    except Exception as e:
        print(f"⚠️ PDF 解析失败: {e}", file=sys.stderr)
    return list(found)
# ======================
# Excel 提取(同样可能有换行,如单元格内 Alt+Enter)
# ======================
def extract_from_excel(excel_path):
    """Collect company names from every sheet of an Excel workbook.

    The workbook is read with the ``openpyxl`` engine first; if that fails
    for any reason the read is retried with ``xlrd`` (the fallback intended
    for legacy ``.xls`` files).  Every cell of every sheet is checked with
    ``is_company_name`` and matching values are stored with all whitespace
    removed, so names containing in-cell line breaks are normalised.

    Parsing is best-effort: nothing is raised.  When both engines fail, one
    message listing the failure of *each* engine is written to stderr, so
    the real cause is not hidden behind the fallback engine's error.

    Returns a list of unique names in no particular order.
    """
    found = set()
    failures = []
    for engine in ('openpyxl', 'xlrd'):
        try:
            sheets = pd.read_excel(
                excel_path,
                sheet_name=None,  # all sheets; iterated via .values() below
                header=None,      # no header row, so first-row cells are scanned too
                dtype=str,
                engine=engine,
            )
            for frame in sheets.values():
                if frame.empty:
                    continue
                for value in frame.values.flatten():
                    if pd.notna(value) and is_company_name(str(value)):
                        # Non-empty is guaranteed here: is_company_name only
                        # accepts text with at least 4 non-space characters.
                        found.add(re.sub(r'\s+', '', str(value)))
        except Exception as e:
            # Remember why this engine failed and try the next one.
            failures.append(f"{engine}: {e}")
        else:
            # This engine read the workbook; no fallback needed.
            break
    else:
        # Loop ended without a break: every engine failed.
        details = "; ".join(failures)
        print(f"⚠️ Excel 解析失败: {details}", file=sys.stderr)
    return list(found)
# ======================
# Word 提取(表格或段落中可能有换行)
# ======================
def extract_from_docx(docx_path):
    """Collect company names from the tables of a Word (.docx) document.

    The text of every cell in every table exposed by ``doc.tables`` is
    checked with ``is_company_name``; matches are stored with all
    whitespace removed.  Body paragraphs are not scanned.

    Parsing is best-effort: any exception is reported on stderr and the
    names gathered up to that point are still returned.

    Returns a list of unique names in no particular order.
    """
    found = set()
    try:
        document = Document(docx_path)
        # Lazily flatten table -> row -> cell into a stream of cell texts.
        cell_texts = (
            cell.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
        )
        for raw in cell_texts:
            if raw and is_company_name(raw):
                compact = re.sub(r'\s+', '', raw)
                if compact:
                    found.add(compact)
    except Exception as e:
        print(f"⚠️ Word 解析失败: {e}", file=sys.stderr)
    return list(found)
# ======================
# 主入口
# ======================
def extract_companies(file_path):
    """Extract company names from *file_path*, dispatching on its extension.

    The extension is compared case-insensitively: ``.pdf`` goes to
    ``extract_from_pdf``, ``.xlsx`` / ``.xls`` to ``extract_from_excel`` and
    ``.docx`` to ``extract_from_docx``.

    Returns whatever the chosen extractor returns.

    Raises:
        FileNotFoundError: *file_path* is not an existing regular file.
        ValueError: the extension is not one of the supported formats.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"文件不存在: {file_path}")

    _, suffix = os.path.splitext(file_path)
    ext = suffix.lower()

    if ext == '.pdf':
        return extract_from_pdf(file_path)
    if ext in ('.xlsx', '.xls'):
        return extract_from_excel(file_path)
    if ext == '.docx':
        return extract_from_docx(file_path)
    raise ValueError(f"不支持的文件格式: {ext}")
def main(file_path):
    """Extract company names from *file_path* and print them as JSON.

    On success a JSON object of the form ``{"company_names": [...]}`` with
    the names sorted is printed to stdout (non-ASCII characters are kept
    as-is, indented by two spaces).  On any failure the error is printed to
    stderr and the process exits with status 1.

    The path is supplied by the caller; this function does no command-line
    parsing of its own.
    """
    try:
        payload = {"company_names": sorted(extract_companies(file_path))}
        rendered = json.dumps(payload, ensure_ascii=False, indent=2)
        print(rendered)
    except Exception as e:
        print(f"❌ 错误: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == "__main__":
    # Use the path given on the command line when there is one; with no
    # argument, fall back to the previously hard-coded sample document so
    # running the script bare behaves exactly as before.
    main(sys.argv[1] if len(sys.argv) > 1 else "parse02.docx")