package com.luxsan.llm.ai.service.impl;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import com.luxsan.llm.ai.domain.ValidationResult;
import lombok.RequiredArgsConstructor;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@RequiredArgsConstructor
@Service
public class PdfJsonCompareService {
private final Map<String, Pattern> patternCache = new ConcurrentHashMap<>();
private final ObjectMapper objectMapper = new ObjectMapper();
private final Set<String> municipalities = new HashSet<>(); // 直辖市列表
public String readPdfText(MultipartFile file) throws IOException {
try (PDDocument doc = PDDocument.load(file.getInputStream())) {
PDFTextStripper stripper = new PDFTextStripper();
String rawText = stripper.getText(doc);
return rawText.replaceAll("\\s+", " ").trim(); // 统一空白符
}
}
public JsonNode parseJson(String jsonContent) throws Exception {
return this.objectMapper.readTree(jsonContent);
}
public List<ValidationResult> compareContent(String pdfText, JsonNode jsonConfig) {
List<ValidationResult> results = new ArrayList<>();
// 1. 处理 JSON 结构(支持单个对象或数组)
JsonNode dataNode;
if (jsonConfig.isArray() && jsonConfig.size() > 0) {
dataNode = jsonConfig.get(0);
} else if (jsonConfig.isObject()) {
dataNode = jsonConfig;
} else {
results.add(new ValidationResult("ERROR", "JSON格式错误", "期望一个对象或包含对象的数组", "实际格式不匹配", false));
return results;
}
// 2. 字段直接匹配(跳过地址字段)
checkDirectFields(pdfText, dataNode, results);
// 3. 连续字段匹配(地址字段专用)
checkConsecutiveFields(pdfText, dataNode, results);
// 4. 正则检查
performRegexChecks(pdfText, results);
return results;
}
/**
* 检查 JSON 中所有非空字段是否严格存在于 PDF 文本中
*/
private void checkDirectFields(String pdfText, JsonNode jsonConfig, List<ValidationResult> results) {
Iterator<Map.Entry<String, JsonNode>> fields = jsonConfig.fields();
while (fields.hasNext()) {
Map.Entry<String, JsonNode> entry = fields.next();
String fieldName = entry.getKey();
JsonNode valueNode = entry.getValue();
if (valueNode.isValueNode()) {
String expectedValue = valueNode.asText().trim();
if (expectedValue.isEmpty()) continue;
boolean isNumericField = isNumeric(expectedValue);
boolean found = false;
if (isNumericField) {
Pattern pattern = Pattern.compile("\\b" + Pattern.quote(expectedValue) + "\\b");
found = pattern.matcher(pdfText).find();
} else {
// 非数字字段严格匹配(前后有非中文字符或边界)
Pattern pattern = Pattern.compile("[^\\u4e00-\\u9fa5]" + Pattern.quote(expectedValue) + "[^\\u4e00-\\u9fa5]");
found = pattern.matcher(" " + pdfText + " ").find();
}
results.add(new ValidationResult(
"FIELD",
fieldName,
expectedValue,
found ? "Found" : "Not Found",
found
));
}
}
}
/**
* 执行正则表达式检查
*/
private void performRegexChecks(String pdfText, List<ValidationResult> results) {
List<Term> terms = HanLP.segment(pdfText);
try (InputStream is = getClass().getClassLoader().getResourceAsStream("validation_rules.json")) {
JsonNode config = objectMapper.readTree(is);
JsonNode regexChecks = config.path("regexChecks");
if (regexChecks.isMissingNode()) return;
Iterator<Map.Entry<String, JsonNode>> regexes = regexChecks.fields();
while (regexes.hasNext()) {
Map.Entry<String, JsonNode> entry = regexes.next();
String checkName = entry.getKey();
String regexPattern = entry.getValue().asText();
Pattern pattern = getCachedPattern(regexPattern);
Matcher matcher = pattern.matcher(pdfText);
boolean found = matcher.find();
results.add(new ValidationResult(
"REGEX",
checkName,
regexPattern,
found ? "Matched" : "Not Matched",
found
));
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* 获取或创建缓存中的 Pattern 对象
*/
private Pattern getCachedPattern(String regex) {
return patternCache.computeIfAbsent(regex, Pattern::compile);
}
/**
* 修复地址字段
*/
private void checkConsecutiveFields(String pdfText, JsonNode jsonConfig, List<ValidationResult> results) {
List<Term> terms = HanLP.segment(pdfText);
List<String> addressParts = new ArrayList<>();
String lastPart = null;
for (Term term : terms) {
String word = term.word;
String nature = term.nature.toString();
if (word.matches("\\d{5,7}")) {
addressParts.add(word);
} else if (nature.startsWith("ns")) {
if (lastPart != null && municipalities.contains(lastPart) && municipalities.contains(word)) {
addressParts.set(addressParts.size() - 1, word); // 合并直辖市
} else {
addressParts.add(word);
}
lastPart = word;
}
}
if (addressParts.size() >= 2) {
String city = addressParts.get(0);
String state = addressParts.get(1);
String zip = addressParts.size() > 2 ? addressParts.get(2) : "";
// 强制直辖市 city == state
if (municipalities.contains(city)) {
addResult(jsonConfig, results, "SHIPTOCITY", city);
addResult(jsonConfig, results, "SHIPTOSTATE", city);
} else if (municipalities.contains(state)) {
addResult(jsonConfig, results, "SHIPTOCITY", state);
addResult(jsonConfig, results, "SHIPTOSTATE", state);
} else {
addResult(jsonConfig, results, "SHIPTOCITY", city);
addResult(jsonConfig, results, "SHIPTOSTATE", state);
}
addResult(jsonConfig, results, "SHIPTOZIP", zip);
}
}
/**
* 辅助方法:添加单个字段结果(防止重复)
*/
private void addResult(JsonNode jsonConfig, List<ValidationResult> results, String fieldName, String actualValue) {
if (jsonConfig.has(fieldName)) {
JsonNode valueNode = jsonConfig.get(fieldName);
if (valueNode.isValueNode()) {
String expectedValue = valueNode.asText().trim();
// 检查是否已存在该字段
boolean alreadyExists = results.stream()
.anyMatch(r -> r.getFieldName().equals(fieldName));
if (!alreadyExists) {
results.add(new ValidationResult(
"FIELD",
fieldName,
expectedValue,
actualValue,
expectedValue.equals(actualValue)
));
}
}
}
}
/**
* 判断字符串是否为数字(整数、小数、科学计数法等)
*/
private boolean isNumeric(String str) {
try {
Double.parseDouble(str);
return true;
} catch (NumberFormatException e) {
return false;
}
}
}
还是上海上海201800 这个那个三个就是false 深圳广东518000 这个也是 帮我解决一下 然后代码哪里不需要的就去掉
最新发布