目标:做一个文字匹配解析例子(这里这个例子是通过职位 title 得到真正级别 band,从而获取上下级关系)。
功能:
1)Json 格式配置文件读取;
2)通过配置文件灵活控制解析功能(title 与配置内容均先转为小写再比较),如:filter(title 包含该子串即命中该级别),bypass(title 包含该子串则跳过该级别),filter_reg(正则需匹配整个 title 才命中该级别),bypass_reg(正则匹配整个 title 则跳过该级别)。
e.g. LangRecogUtils
/**
 * Maps a free-text job title to a numeric level ("band") using the rules stored in the
 * JSON dictionary {@code band.dic}.
 *
 * <p>The dictionary is a JSON array. Each element may carry:
 * <ul>
 *   <li>{@code band} - the integer returned when the element matches;</li>
 *   <li>{@code bypass_list} - rules ({@code bypass} substring, {@code bypass_reg} regex) that,
 *       when hit, make the scan skip this element;</li>
 *   <li>{@code filter_list} - rules ({@code filter} substring, {@code filter_reg} regex) that,
 *       when hit, select this element's band.</li>
 * </ul>
 * Elements are evaluated in file order and the first element whose filter list hits wins.
 * Substring rules use {@link String#contains}; regex rules must match the whole title.
 * Both the title and the dictionary text are lower-cased before comparison.
 */
public class LangRecogUtils {
    static final Logger logger = LoggerFactory.getLogger(LangRecogUtils.class);

    /** Dictionary resource name handed to {@link #build(String)}. */
    private static final String PATH = "band.dic";

    /** Band returned when no rule matches, the title is absent, or the dictionary is unusable. */
    private static final int DEFAULT_BAND = 0;

    /** Shared mapper; creating one per call is needless work. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Resolves the band for a job title.
     *
     * @param title the job title; {@code null} or the literal text "NULL" (any case) yields
     *              the default band
     * @return the band of the first dictionary element whose filter list matches and whose
     *         bypass list does not, or {@code 0} when nothing matches or the dictionary
     *         cannot be parsed
     */
    public static int getBand(String title) {
        if (title == null || title.equalsIgnoreCase("NULL")) {
            return DEFAULT_BAND;
        }
        // Locale.ROOT keeps the folding stable regardless of the JVM default locale
        // (e.g. a Turkish default would turn "I" into a dotless "i" and break matching).
        String titleInLowerCase = title.toLowerCase(java.util.Locale.ROOT);
        String allLines = build(PATH);
        try {
            // Parse from the String directly: going through getBytes() would re-encode the
            // text in the platform default charset and could corrupt non-ASCII rules.
            JsonNode rootNode = MAPPER.readValue(allLines, JsonNode.class);
            if (rootNode == null) {
                return DEFAULT_BAND;
            }
            logger.debug("LangRecogUtils - rootNode Size: {}", rootNode.size());
            Iterator<JsonNode> entries = rootNode.getElements();
            while (entries.hasNext()) {
                JsonNode entry = entries.next();
                int entryBand = readBand(entry);
                // A bypass hit disqualifies this element only; scanning resumes with the next one.
                if (anyRuleMatches(entry.get("bypass_list"), "bypass", titleInLowerCase)) {
                    continue;
                }
                if (anyRuleMatches(entry.get("filter_list"), "filter", titleInLowerCase)) {
                    return entryBand;
                }
            }
        } catch (IOException e) {
            // Jackson's JsonParseException and JsonMappingException are IOException subclasses.
            logger.error(e.getMessage(), e);
        }
        return DEFAULT_BAND;
    }

    /** Reads the element's {@code band} value, logging an error and using the default when absent. */
    private static int readBand(JsonNode entry) {
        JsonNode bandNode = entry.get("band");
        if (bandNode == null) {
            logger.error("The \"band\" session is madatory.");
            return DEFAULT_BAND;
        }
        int entryBand = bandNode.getIntValue();
        logger.debug("LangRecogUtils - temp band: {}", entryBand);
        return entryBand;
    }

    /**
     * Tells whether any rule of a bypass/filter list hits the title.
     *
     * @param ruleList the {@code bypass_list} or {@code filter_list} node, possibly {@code null}
     * @param key      "bypass" or "filter"; the regex variant is looked up as {@code key + "_reg"}
     * @param title    the lower-cased title
     */
    private static boolean anyRuleMatches(JsonNode ruleList, String key, String title) {
        if (ruleList == null) {
            return false;
        }
        Iterator<JsonNode> rules = ruleList.getElements();
        while (rules.hasNext()) {
            JsonNode rule = rules.next();
            String literal = textOf(rule, key);
            if (literal != null) {
                logger.debug("LangRecogUtils - {}: {}", key, literal);
                if (title.contains(literal)) {
                    return true;
                }
            }
            String regex = textOf(rule, key + "_reg");
            if (regex != null) {
                logger.debug("LangRecogUtils - {} reg: {}", key, regex);
                if (matchesWhole(regex, title)) {
                    return true;
                }
            }
        }
        return false;
    }

    /** Returns the string value of {@code rule[key]}, or {@code null} when absent or not a string. */
    private static String textOf(JsonNode rule, String key) {
        JsonNode field = rule.get(key);
        if (field == null) {
            return null;
        }
        String text = field.getTextValue();
        if (text == null) {
            // A non-string value would otherwise surface as a NullPointerException downstream.
            logger.warn("LangRecogUtils - ignoring non-text value for \"{}\"", key);
        }
        return text;
    }

    /** Full-string regex match; a malformed pattern is logged and treated as "no match". */
    private static boolean matchesWhole(String regex, String title) {
        try {
            return Pattern.compile(regex).matcher(title).matches();
        } catch (java.util.regex.PatternSyntaxException e) {
            logger.error("LangRecogUtils - invalid regular expression in dictionary: " + regex, e);
            return false;
        }
    }

    /**
     * Reads a dictionary resource as UTF-8 and returns its lines concatenated (without line
     * separators) and lower-cased.
     *
     * <p>Any failure while reading is logged and whatever was read so far (possibly the
     * empty string) is returned.
     *
     * <p>NOTE: the whole document is lower-cased, not just the rule values, so upper-case
     * regex escapes such as {@code \S} or {@code \D} inside {@code *_reg} rules are turned
     * into their lower-case counterparts.
     *
     * @param dictName resource name passed to {@code Util.getInputStream}
     * @return the lower-cased dictionary text
     */
    public static String build(String dictName) {
        BufferedReader reader = null;
        int lineCount = 0;
        StringBuilder content = new StringBuilder();
        try {
            reader = new BufferedReader(new InputStreamReader(Util.getInputStream(dictName), "utf-8"));
            String line;
            // Read to end of stream: stopping at the first blank line would silently
            // truncate a dictionary that contains one.
            while ((line = reader.readLine()) != null) {
                lineCount++;
                content.append(line);
            }
            logger.debug("LangRecogUtils - total read lines: {}", lineCount);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception e) {
                    logger.error(e.getMessage(), e);
                }
            }
        }
        return content.toString().toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Manual smoke test: prints the resolved band for a fixed set of sample titles.
     */
    public static void main(String[] args) {
        String[] testStrs = {
            "Executive Officer & Chief of Staff for the CIO",
            "Senior Program Manager-Public Key Infrastructure (PKI)",
            "Deputy Director of Intelligence (Deployment)",
            "Human Resources Manager",
            "Overt Debriefing Team Chief",
            "lead Security Contractor",
            "Assistant Project Manager",
            "Senior Watch Officer",
            "Naval Attaché",
            "Operations Officer",
            "Executive Admin Assistant - E4",
            "Engineer Intern"
        };
        /*
         * Expectation:
         * 5,10,10,15,20,20,25,25,30,30,35,35
         */
        for (int i = 0; i < testStrs.length; i++) {
            System.out.println("LangRecogUtils - Str:" + (i + 1) + ", band:" + getBand(testStrs[i]));
        }
    }
}
配置文件,band.dic:
[
{
"band": 5,
"filter_list": [
{
"filter": "chief"
}
],
"bypass_list": [
{
"bypass": "team chief"
},
{
"bypass": "Colonel"
}
]
},
{
"band": 10,
"filter_list": [
{
"filter": "director"
},
{
"filter_reg": "senior(.*?)manager(.*)"
},
{
"filter_reg": "senior(.*?)strategist(.*)"
}
]
},
{
"band": 15,
"filter_list": [
{
"filter": "manager"
},
{
"filter": "strategist"
},
{
"filter": "Subject Matter Expert"
},
{
"filter": "Consultant"
}
],
"bypass_list": [
{
"bypass_reg": "senior(.*?)manager(.*)"
},
{
"bypass_reg": "senior(.*?)strategist(.*)"
},
{
"bypass_reg": "Assistant(.*?)manager(.*)"
}
]
},
{
"band": 20,
"filter_list": [
{
"filter": "instructor"
},
{
"filter": "Lead"
},
{
"filter": "Team Chief"
},
{
"filter": "Superintendent"
},
{
"filter": "Supervisor"
},
{
"filter": "Specialist"
}
]
},
{
"band": 25,
"filter_list": [
{
"filter_reg": "Assistant(.*?)manager(.*)"
},
{
"filter_reg": "senior(.*)"
}
],
"bypass_list": [
{
"bypass_reg": "senior(.*?)manager(.*)"
},
{
"bypass_reg": "senior(.*?)strategist(.*)"
}
]
},
{
"band": 35,
"filter_list": [
{
"filter": "Administrative Assistant"
},
{
"filter": "Admin Assistant"
},
{
"filter": "Coordinator"
},
{
"filter": "Contractor"
},
{
"filter": "Internship"
},
{
"filter": "Intern"
},
{
"filter": "Student"
},
{
"filter": "Trainee"
},
{
"filter": "Security Guard"
},
{
"filter": "Part-Time"
},
{
"filter": "Volunteer"
}
]
},
{
"band": 30,
"filter_list": [
{
"filter_reg": "(.*)"
}
]
}
]