import pdfplumber
import pandas as pd
import os
import re
def extract_materials_from_pdf(pdf_path, filename):
    """Extract material records from every page of an ISO-drawing PDF.

    Args:
        pdf_path: Path of the PDF file, opened with ``pdfplumber``.
        filename: Name stored on each record and used in console messages.

    Returns:
        A list of material dicts as returned by ``parse_iso_materials``.
        When an error occurs, the records collected from the pages handled
        before the failure are returned (an empty list if there were none),
        so one unreadable page no longer discards the whole file.
    """
    materials = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                # Pages without a text layer yield no text and are skipped.
                if text:
                    materials.extend(parse_iso_materials(text, filename))
    except Exception as e:
        # Best-effort boundary for batch processing: report the problem and
        # keep whatever was already extracted instead of aborting the batch.
        print(f"处理文件 {filename} 时出错: {e}")
    return materials
# Column titles that mark the first line of the material table.
_ISO_TABLE_HEADER_RE = re.compile(
    r'PT\s*NO|COMPONENT\s*DESCRIPTION|N\.S\.\(MM\)|IDENT\s*CODE|QTY',
    re.IGNORECASE,
)
# Markers of the cut-length table / other non-material sections; parsing
# stops when one is seen.  The non-ASCII alternatives are matched exactly as
# they appear in the extracted text.
_ISO_CUT_LENGTH_RE = re.compile(
    r'CUT LENGTH TABLE|SPOOL N|ɍɁɅȺ|LENGTH\(MM\)|ȾɅɂɇȺ\(MM\)',
    re.IGNORECASE,
)
# Drawing-information lines (issue status, plant north, coordinates).  The
# coordinate alternative is anchored with \b so that "E 123" / "N 456" match
# but ordinary rows such as "PIPE 100" or "FLANGE 50" no longer do.
_ISO_DRAWING_INFO_RE = re.compile(
    r'ISSUED FOR|N-PLANT NORTH|\b[EN] \d+|EL\. \+\d+',
    re.IGNORECASE,
)
# Markers that end the material table.
_ISO_TABLE_END_RE = re.compile(
    r'CUT LENGTH TABLE|CONT\. ON|CONT\. FROM',
    re.IGNORECASE,
)
# A material row starts with its item number followed by whitespace.
_ISO_ITEM_ROW_RE = re.compile(r'^\d+\s')


def parse_iso_materials(text, filename):
    """Parse the material table found in the text of one ISO-drawing page.

    Args:
        text: Text extracted from a single PDF page.
        filename: Name of the source file; stored on every record under
            ``'单线图文件名'`` and used in console messages.

    Returns:
        A list of material dicts.  Each dict is the result of
        ``parse_material_line`` extended with ``'单线图文件名'`` and
        ``'管线号'``.  The list is empty when no table header is found.
    """
    materials = []
    lines = text.split('\n')

    # Locate the first line that looks like the table header.
    table_start = next(
        (idx for idx, row in enumerate(lines) if _ISO_TABLE_HEADER_RE.search(row)),
        -1,
    )
    if table_start == -1:
        print(f" 在文件 {filename} 中未找到材料表格")
        return materials
    print(f" 找到材料表格,开始位置: {table_start}")

    pipeline_number = get_pipeline_number(text)
    current_category = ""
    i = table_start
    while i < len(lines):
        line = lines[i].strip()
        is_item_row = _ISO_ITEM_ROW_RE.match(line) is not None

        # Category header.  Numbered rows are never headers, even when their
        # description contains a category word (e.g. "15 95 mm BOLTS/NUTS").
        if not is_item_row and is_category_line(line):
            current_category = get_category(line)
            print(f" 发现类别: {current_category}")
            i += 1
            continue

        # Blank line.
        if not line:
            i += 1
            continue

        # Cut-length table or another non-material section: stop parsing.
        if _ISO_CUT_LENGTH_RE.search(line):
            print(" 遇到切割长度表,停止解析")
            break

        # Drawing-information line: skip it.
        if _ISO_DRAWING_INFO_RE.search(line):
            i += 1
            continue

        # Try to read a material record starting at this line.
        material = parse_material_line(lines, i, current_category)
        if material and material['材料代码'] and material['材料介绍']:
            material['单线图文件名'] = filename
            material['管线号'] = pipeline_number
            materials.append(material)
            print(f" 提取材料: {material['材料代码']} - 规格: {material['材料规格']} - 数量: {material['数量']}")
            # Multi-line records report how many lines they consumed.
            i += material.get('lines_processed', 1)
        else:
            i += 1

        # Leave early when the next line is an explicit end-of-table marker.
        if i < len(lines) and _ISO_TABLE_END_RE.search(lines[i]):
            print(" 遇到表格结束标记,停止解析")
            break
    return materials
# Quantity: a trailing number (optionally followed by "M") that is its own
# whitespace-separated token, so the digit tail of an ident code such as
# "EABCD123" is not mistaken for a quantity.
_ML_QTY_RE = re.compile(r'(?:^|\s)(\d+\.?\d*\s*M?)$')
# Ident code: a trailing run of at least six upper-case letters, digits,
# underscores or hyphens.
_ML_CODE_RE = re.compile(r'([A-Z0-9][A-Z0-9_\-]{5,})\s*$')
# Size: a trailing "N" or "N x N" token with an optional unit; it must be a
# separate token so that ratings such as "CL300" stay in the description.
_ML_SIZE_RE = re.compile(r'(?:^|\s)(\d+(?:\s?[xX×]\s?\d+)?)\s*(?:MM|mm|")?$')


def parse_material_line(lines, start_index, category):
    """Parse one material row, delegating supports, bolts and gaskets.

    Args:
        lines: All text lines of the page.
        start_index: Index in ``lines`` of the row to parse.
        category: Current category name; ``"PIPE SUPPORTS"``, ``"BOLTS"`` and
            ``"GASKETS"`` are handled by their dedicated parsers.

    Returns:
        A dict with the keys ``'材料代码'``, ``'材料规格'``, ``'材料介绍'``,
        ``'数量'`` and ``'lines_processed'``, or ``None`` when the line is
        not a material row or no quantity/description can be found.
    """
    line = lines[start_index].strip()

    # A material row starts with the item number followed by whitespace.
    item_no_match = re.match(r'^(\d+)\s+', line)
    if not item_no_match:
        return None
    item_no = item_no_match.group(1)
    remaining_line = line[len(item_no):].strip()

    # Categories with their own row layout.
    if category == "PIPE SUPPORTS":
        return parse_pipe_support_line(lines, start_index, category)
    if category == "BOLTS":
        return parse_bolt_line(lines, start_index, category)
    if category == "GASKETS":
        return parse_gasket_line(lines, start_index, category)

    # The following line is consulted when a column is missing on this row.
    next_line = lines[start_index + 1].strip() if start_index + 1 < len(lines) else ""

    # Quantity: at the end of this row, otherwise at the end of the next line.
    qty_match = _ML_QTY_RE.search(remaining_line)
    if qty_match:
        quantity = qty_match.group(1)
        remaining_line = remaining_line[:qty_match.start()].rstrip()
    else:
        qty_match = _ML_QTY_RE.search(next_line)
        if not qty_match:
            return None
        # Nothing is stripped from this row: the value came from the next line.
        quantity = qty_match.group(1)

    # Ident code: end of this row, then end of the next line, then generated.
    code_match = _ML_CODE_RE.search(remaining_line)
    if code_match:
        ident_code = code_match.group(1)
        remaining_line = remaining_line[:code_match.start()].rstrip()
    else:
        code_match = _ML_CODE_RE.search(next_line)
        if code_match:
            ident_code = code_match.group(1)
        elif re.search(r'[A-Za-z]', remaining_line):
            # No explicit code: derive one from the item number and the
            # first ten alphanumeric characters of the description.
            compact = re.sub(r'[^A-Z0-9]', '', remaining_line.upper())
            ident_code = f"MAT-{item_no}-{compact[:10]}"
        else:
            return None

    # Size: trailing number or "N x N", optionally followed by a unit.
    size_match = _ML_SIZE_RE.search(remaining_line)
    if size_match:
        size = size_match.group(1)
        remaining_line = remaining_line[:size_match.start()].rstrip()
    else:
        size = ""

    # Whatever is left is the description; it must contain a letter.
    description = remaining_line.strip()
    if not re.search(r'[A-Za-z]', description):
        return None
    description = re.sub(r'\s+', ' ', description).strip()

    return {
        '材料代码': ident_code,
        '材料规格': size,
        '材料介绍': f"{category}: {description}" if category else description,
        '数量': quantity,
        'lines_processed': 1
    }
def parse_pipe_support_line(lines, start_index, category):
    """Parse a pipe-support row of the form ``item  tag  [size]  qty``.

    Example row: ``6 J(S1)-8"-CS1-100 200 4``.

    Quantity and size are only accepted as separate whitespace-delimited
    tokens, so the numeric tail of the support tag (``...-CS1-100``) is never
    cut off and reported as the size or the quantity.

    Args:
        lines: All text lines of the page.
        start_index: Index in ``lines`` of the row to parse.
        category: Category name used as the description prefix.

    Returns:
        A dict with the keys ``'材料代码'``, ``'材料规格'``, ``'材料介绍'``,
        ``'数量'`` and ``'lines_processed'``, or ``None`` when the row has no
        item number, no trailing quantity token or no letters in its
        description.
    """
    line = lines[start_index].strip()

    # Item number.
    item_no_match = re.match(r'^(\d+)\s+', line)
    if not item_no_match:
        return None
    item_no = item_no_match.group(1)
    remaining_line = line[len(item_no):].strip()

    # Quantity: the last token, digits only.
    qty_match = re.search(r'(?:^|\s)(\d+)$', remaining_line)
    if not qty_match:
        return None
    quantity = qty_match.group(1)
    remaining_line = remaining_line[:qty_match.start()].rstrip()

    # Size: an optional trailing number token, optionally with a unit.
    size_match = re.search(r'(?:^|\s)(\d+)\s*(?:MM|mm|")?$', remaining_line)
    if size_match:
        size = size_match.group(1)
        remaining_line = remaining_line[:size_match.start()].rstrip()
    else:
        size = ""

    # The rest is the support tag / description.
    description = remaining_line.strip()
    if not re.search(r'[A-Za-z]', description):
        return None

    # Support rows carry no ident code, so one is generated from the item
    # number and the first ten alphanumeric characters of the description.
    clean_desc = re.sub(r'[^A-Z0-9]', '', description.upper())
    ident_code = f"PS-{item_no}-{clean_desc[:10]}" if clean_desc else f"PS-{item_no}"

    return {
        '材料代码': ident_code,
        '材料规格': size,
        '材料介绍': f"{category}: {description}",
        '数量': quantity,
        'lines_processed': 1
    }
# A new item row starts with an item number followed by whitespace.
_BOLT_ITEM_ROW_RE = re.compile(r'^\d+\s')
# Quantity: trailing digits that form their own token, so the tail of a
# standard designation such as "B18.31.2" is not taken for a quantity.
_BOLT_QTY_TOKEN_RE = re.compile(r'(?:^|\s)(\d+)$')
# Ident-code pattern that marks the data line of a bolt record.
_BOLT_CODE_HINT_RE = re.compile(r'PLLS\d+')
# Ident code: trailing run of at least six upper-case/digit/_/- characters.
_BOLT_IDENT_CODE_RE = re.compile(r'([A-Z0-9][A-Z0-9_\-]{5,})\s*$')


def _parse_single_line_bolt(header_text, category):
    """Parse a bolt row that holds description, ident code and quantity."""
    qty_match = _BOLT_QTY_TOKEN_RE.search(header_text)
    if not qty_match:
        return None
    before_qty = header_text[:qty_match.start()].rstrip()
    code_match = _BOLT_IDENT_CODE_RE.search(before_qty)
    if not code_match:
        return None
    description = before_qty[:code_match.start()].strip()
    if not re.search(r'[A-Za-z]', description):
        return None
    return {
        '材料代码': code_match.group(1),
        '材料规格': "",
        '材料介绍': f"{category}: {description}",
        '数量': qty_match.group(1),
        'lines_processed': 1
    }


def parse_bolt_line(lines, start_index, category):
    """Parse a bolt record, which usually spans several lines.

    Typical layout::

        15 95 mm BOLTS/NUTS,-,A320 Gr.L7/A194 GR.7 S3,FT
        S.BOLT/2 HHN,ASME B18.31.2
        5/8 PLLS60NZZ-95 4

    The first line carries the item number and the start of the
    description, following lines continue the description, and the last
    line holds size, ident code and quantity.  A row that already ends with
    ``<ident code> <quantity>`` and is directly followed by the next item
    (or by the end of the page) is parsed on its own, so it no longer
    consumes the next item's row as its data line.

    Args:
        lines: All text lines of the page.
        start_index: Index in ``lines`` of the first line of the record.
        category: Category name used as the description prefix.

    Returns:
        A dict with the keys ``'材料代码'``, ``'材料规格'``, ``'材料介绍'``,
        ``'数量'`` and ``'lines_processed'`` (number of lines consumed), or
        ``None`` when no item number, data line, quantity or textual
        description is found.
    """
    line = lines[start_index].strip()

    # Item number.
    item_no_match = re.match(r'^(\d+)\s+', line)
    if not item_no_match:
        return None
    item_no = item_no_match.group(1)
    header_text = line[len(item_no):].strip()

    # Collect description continuation lines until a line looks like the
    # start of another row or like the data line of this record.
    description_lines = [header_text]
    lines_processed = 1
    while start_index + lines_processed < len(lines):
        next_line = lines[start_index + lines_processed].strip()
        if _BOLT_ITEM_ROW_RE.match(next_line):
            break
        if _BOLT_QTY_TOKEN_RE.search(next_line) or _BOLT_CODE_HINT_RE.search(next_line):
            break
        description_lines.append(next_line)
        lines_processed += 1

    data_index = start_index + lines_processed
    candidate = lines[data_index].strip() if data_index < len(lines) else None

    # Complete single-line record directly followed by another item row or
    # by the end of the page.
    if lines_processed == 1 and (candidate is None or _BOLT_ITEM_ROW_RE.match(candidate)):
        single = _parse_single_line_bolt(header_text, category)
        if single is not None:
            return single

    if candidate is None:
        return None

    # The line the loop stopped at holds size, ident code and quantity.
    data_line = candidate
    lines_processed += 1

    qty_match = _BOLT_QTY_TOKEN_RE.search(data_line)
    if not qty_match:
        return None
    quantity = qty_match.group(1)
    data_line = data_line[:qty_match.start()].rstrip()

    code_match = _BOLT_IDENT_CODE_RE.search(data_line)
    if code_match:
        ident_code = code_match.group(1)
        data_line = data_line[:code_match.start()].rstrip()
    else:
        # No explicit code on the data line: generate one.
        ident_code = f"BOLT-{item_no}"

    # What remains of the data line is treated as the size.
    size = data_line.strip()

    description = ' '.join(description_lines).strip()
    if not re.search(r'[A-Za-z]', description):
        return None

    return {
        '材料代码': ident_code,
        '材料规格': size,
        '材料介绍': f"{category}: {description}",
        '数量': quantity,
        'lines_processed': lines_processed
    }
def parse_gasket_line(lines, start_index, category):
    """Parse a gasket row whose description may continue on the next line.

    Expected row layout: ``item  description  [size]  ident-code  qty``.
    Quantity and size must be separate whitespace-delimited tokens, so a
    rating such as ``CL150`` stays part of the description.

    Args:
        lines: All text lines of the page.
        start_index: Index in ``lines`` of the row to parse.
        category: Category name used as the description prefix.

    Returns:
        A dict with the keys ``'材料代码'``, ``'材料规格'``, ``'材料介绍'``,
        ``'数量'`` and ``'lines_processed'``, or ``None`` when the row has no
        item number, quantity, ident code or textual description.
    """
    line = lines[start_index].strip()

    # Item number.
    item_no_match = re.match(r'^(\d+)\s+', line)
    if not item_no_match:
        return None
    item_no = item_no_match.group(1)
    remaining_line = line[len(item_no):].strip()

    # Quantity: the last token, digits only.
    qty_match = re.search(r'(?:^|\s)(\d+)$', remaining_line)
    if not qty_match:
        return None
    quantity = qty_match.group(1)
    remaining_line = remaining_line[:qty_match.start()].rstrip()

    # Ident code: required for gaskets.
    code_match = re.search(r'([A-Z0-9][A-Z0-9_\-]{5,})\s*$', remaining_line)
    if not code_match:
        return None
    ident_code = code_match.group(1)
    remaining_line = remaining_line[:code_match.start()].rstrip()

    # Size: optional trailing number token, optionally with a unit.
    size_match = re.search(r'(?:^|\s)(\d+)\s*(?:MM|mm|")?$', remaining_line)
    if size_match:
        size = size_match.group(1)
        remaining_line = remaining_line[:size_match.start()].rstrip()
    else:
        size = ""

    description = remaining_line.strip()

    # A non-empty next line that is neither a new item row nor a category
    # header continues the description.  It is not counted in
    # 'lines_processed', so the caller still inspects that line itself.
    if start_index + 1 < len(lines):
        next_line = lines[start_index + 1].strip()
        if (
            next_line
            and not re.match(r'^\d+\s', next_line)
            and not is_category_line(next_line)
        ):
            description += " " + next_line

    if not re.search(r'[A-Za-z]', description):
        return None

    return {
        '材料代码': ident_code,
        '材料规格': size,
        '材料介绍': f"{category}: {description}",
        '数量': quantity,
        'lines_processed': 1
    }
def is_category_line(line):
    """Return True when ``line`` is a category header of the material table.

    A line counts as a header when it contains one of the known category
    names (case-insensitive).  Rows that start with an item number are never
    headers, even if their description contains a category word such as
    ``BOLTS`` in ``15 95 mm BOLTS/NUTS``.

    Args:
        line: One text line of the page.

    Returns:
        ``True`` for a category header, ``False`` otherwise.
    """
    # Numbered rows are material rows, not headers.
    if re.match(r'^\s*\d+\s', line):
        return False
    categories = ('PIPES', 'FITTINGS', 'FLANGES', 'GASKETS', 'BOLTS',
                  'VALVES', 'INSTRUMENTS', 'PIPE SUPPORTS', 'VALVES / IN-LINE ITEMS')
    upper_line = line.upper()
    return any(cat in upper_line for cat in categories)
def get_category(line):
    """Return the first known category name contained in ``line``.

    The comparison is case-insensitive and the names are tried in the order
    listed below.

    NOTE: because ``'VALVES'`` is tried before ``'VALVES / IN-LINE ITEMS'``,
    a "VALVES / IN-LINE ITEMS" header is reported as ``'VALVES'``.

    Args:
        line: One text line of the page.

    Returns:
        The matching category name, or ``""`` when none is contained.
    """
    categories = ('PIPES', 'FITTINGS', 'FLANGES', 'GASKETS', 'BOLTS',
                  'VALVES', 'INSTRUMENTS', 'PIPE SUPPORTS', 'VALVES / IN-LINE ITEMS')
    upper_line = line.upper()
    for cat in categories:
        if cat in upper_line:
            return cat
    return ""
def get_pipeline_number(text):
    """Extract the pipeline (line) number from the page text.

    The known number formats are tried in order; for the first format that
    occurs in ``text`` the longest occurrence is returned.  If none occurs, a
    looser ``9170...-PRP`` / ``9170...-LN`` pattern is searched in the same
    text.

    Args:
        text: Text extracted from a PDF page; may be empty or ``None``.

    Returns:
        The pipeline number, or ``"未识别"`` when nothing matches.
    """
    if not text:
        return "未识别"

    # Explicit pipeline-number formats, most specific first.
    patterns = (
        r'9170-\d{4}-[A-Z0-9]+-[A-Z]{2,4}',
        r'9170-\d{4}-[A-Z0-9]+',
        r'CA911[EF]',
        r'91-E-\d+-T\d+',
    )
    for pattern in patterns:
        matches = re.findall(pattern, text)
        if matches:
            # The longest occurrence is usually the most complete one.
            return max(matches, key=len)

    # Fallback: a filename-like token that appears inside the page text.
    fallback_match = re.search(r'9170[^\s]*-(?:PRP|LN)', text)
    if fallback_match:
        return fallback_match.group(0)
    return "未识别"
def batch_process_pdfs(folder_path):
    """Process every PDF file found directly inside ``folder_path``.

    Files are handled in sorted name order so that the row order of the
    result does not depend on the arbitrary order of ``os.listdir``.

    Args:
        folder_path: Directory that contains the PDF files.

    Returns:
        A list with the material dicts of all files, or ``None`` when the
        folder contains no ``.pdf`` file.
    """
    all_materials = []
    pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.pdf'))
    if not pdf_files:
        print("文件夹中没有找到PDF文件")
        return None

    print(f"找到 {len(pdf_files)} 个PDF文件,开始处理...")
    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        print(f"\n处理文件: {pdf_file}")
        materials = extract_materials_from_pdf(pdf_path, pdf_file)
        if materials:
            print(f" 提取到 {len(materials)} 条材料记录")
            # Show at most the first three records as a quick preview.
            for j, material in enumerate(materials[:3]):
                print(f" {j+1}. {material['材料代码']}: {material['材料介绍']} - {material['数量']}")
        else:
            print(" 未提取到材料记录")
        all_materials.extend(materials)
    return all_materials
# A description already carries a category prefix when it starts with an
# upper-case name followed by a colon.  Spaces, "/" and "-" are allowed so
# that "PIPE SUPPORTS:" and "VALVES / IN-LINE ITEMS:" are recognised too.
_CATEGORY_PREFIX_RE = re.compile(r'^[A-Z][A-Z /\-]*:')


def _ensure_category_prefix(description):
    """Return ``description`` with a category prefix, inferring one if needed."""
    if _CATEGORY_PREFIX_RE.match(description):
        return description

    upper = description.upper()
    if 'PIPE' in upper and 'SUPPORT' not in upper:
        return f"PIPES: {description}"
    if 'SUPPORT' in upper or description.startswith('J('):
        return f"PIPE SUPPORTS: {description}"
    # "TEE" is matched as a whole word so that "STEEL" does not count.
    if 'FITTING' in upper or 'ELBOW' in upper or re.search(r'\bTEE\b', upper):
        return f"FITTINGS: {description}"
    if 'FLANGE' in upper:
        return f"FLANGES: {description}"
    if 'GASKET' in upper:
        return f"GASKETS: {description}"
    if 'BOLT' in upper or 'NUT' in upper:
        return f"BOLTS: {description}"
    if 'VALVE' in upper:
        return f"VALVES: {description}"
    if 'INSTRUMENT' in upper:
        return f"INSTRUMENTS: {description}"
    return description


def create_excel(materials_data, output_path):
    """Write the extracted material records to an Excel workbook.

    Args:
        materials_data: List of material dicts; each must contain the keys
            ``'单线图文件名'``, ``'材料代码'``, ``'材料规格'``, ``'材料介绍'``,
            ``'数量'`` and ``'管线号'``.
        output_path: Path of the ``.xlsx`` file to write.

    Returns:
        ``True`` after the file has been written, ``False`` when
        ``materials_data`` is empty or ``None``.
    """
    if not materials_data:
        print("没有提取到材料数据")
        return False

    df = pd.DataFrame(materials_data)

    # Fixed column order of the output sheet.
    columns = ['单线图文件名', '材料代码', '材料规格', '材料介绍', '数量', '管线号']
    df = df[columns]

    # Remove exact repeats only (first occurrence kept).  Size and
    # description are part of the key because generated codes such as
    # "PS-6-..." or "BOLT-15" can coincide for different items of one file.
    df = df.drop_duplicates(
        subset=['单线图文件名', '材料代码', '材料规格', '材料介绍', '数量'],
        keep='first',
    )

    # Make sure every description starts with a category prefix.
    df['材料介绍'] = df['材料介绍'].apply(_ensure_category_prefix)

    df.to_excel(output_path, index=False, engine='openpyxl')
    return True
def main(
    pdf_folder=r"C:\Users\10196\Desktop\PDF文件夹",
    output_excel=r"C:\Users\10196\Desktop\材料清单汇总_精确版.xlsx",
):
    """Extract materials from all PDFs in a folder and export them to Excel.

    Args:
        pdf_folder: Directory with the ISO-drawing PDF files.  Defaults to
            the folder that was previously hard-coded.
        output_excel: Path of the Excel file to create.  Defaults to the
            path that was previously hard-coded.
    """
    print("开始处理PDF文件...")
    materials_data = batch_process_pdfs(pdf_folder)
    if not materials_data:
        print("❌ 没有提取到任何材料数据")
        return

    if not create_excel(materials_data, output_excel):
        print("❌ 生成Excel文件失败")
        return

    print(f"\n✅ Excel文件已生成: {output_excel}")
    print(f"✅ 共提取 {len(materials_data)} 条材料记录")

    # Number of distinct source files that produced records.
    files = set(m['单线图文件名'] for m in materials_data)
    print(f"✅ 处理了 {len(files)} 个文件")

    # Per-category record counts; the category is the text before the first
    # colon of the description, or '其他' when there is no colon.
    categories = {}
    for m in materials_data:
        head, sep, _ = m['材料介绍'].partition(':')
        cat = head if sep else '其他'
        categories[cat] = categories.get(cat, 0) + 1
    print("✅ 材料类别统计:")
    for cat, count in categories.items():
        print(f" {cat}: {count}条")
# Question that was appended to the pasted script (translated): "This code
# keeps extracting the pipe-support data from the PDFs incompletely - what
# can be done?"  It was fused onto the main() call, together with a trailing
# page label ("Latest posts"), which made the file unparsable; both are kept
# here as a comment only.
if __name__ == "__main__":
    main()