实现类似淘宝的智能识别功能:自动识别联系人、地址、电话
最近上班时遇到了这样一个需求。搜了很多资料,发现现成的方案要么需要调用第三方付费接口,要么本身就是收费服务,总之都要花钱。后来想到可以结合DFA算法来实现智能匹配,只是精确度可能略有差异。
大概介绍一下思路:利用DFA算法来实现对地址的过滤,在这里也可以使用类似的思想——先过滤出文本中对应的省、市、区等地址,然后再从剩余内容中过滤出人名。这种做法精度可能不是很高,想完善的可以自己试试。
过滤代码
/**
* Address-word filter: a Spring component that scans text for words held in
* {@link #sensitiveWordMap}.
*
* The map is assigned in {@code afterPropertiesSet()} from
* {@code SensitiveWordInit.initKeyWord()}.
*
* NOTE(review): the matching routine {@code CheckSensitiveWord} that the methods
* of this class call is not part of this excerpt.
*
* @Author: lemon
* @Description: address filtering
*/
@Component
public class SensitivewordFilter implements InitializingBean {
// Word lookup map shared by all instances; null until afterPropertiesSet() runs.
public static Map sensitiveWordMap = null;
public static int minMatchTYpe = 1; // minimum-match rule (presumably: stop at the shortest hit - confirm in CheckSensitiveWord)
public static int maxMatchType = 2; // maximum-match rule (presumably: keep going for the longest hit - confirm in CheckSensitiveWord)
/**
 * Reports whether the text contains at least one word from the dictionary.
 *
 * Every character offset is tried as a possible word start; the scan stops at
 * the first offset for which {@code CheckSensitiveWord} reports a positive
 * match length.
 *
 * NOTE(review): stopping early assumes {@code CheckSensitiveWord} is a pure
 * lookup with no side effects - confirm against its implementation.
 *
 * @param txt       text to scan; {@code null} or empty yields {@code false}
 * @param matchType matching rule, passed through to {@code CheckSensitiveWord}
 * @return {@code true} if any offset starts a dictionary word
 */
public boolean isContaintSensitiveWord(String txt,int matchType){
    if (txt == null || txt.isEmpty()) {
        return false;
    }
    for (int i = 0; i < txt.length(); i++) {
        // A positive length means a dictionary word starts at offset i.
        if (this.CheckSensitiveWord(txt, i, matchType) > 0) {
            return true;
        }
    }
    return false;
}
/**
 * Collects the dictionary words that occur in the text.
 *
 * The text is scanned left to right. Whenever a word is found, the scan
 * resumes right after it, so matches never overlap. The returned set iterates
 * in order of first appearance in the text, which keeps callers that loop over
 * it deterministic.
 *
 * @param txt       text to scan; {@code null} yields an empty set
 * @param matchType matching rule, passed through to {@code CheckSensitiveWord}
 * @return the distinct matched words in encounter order, never {@code null}
 */
public Set<String> getSensitiveWord(String txt , int matchType){
    // Insertion-ordered so iteration follows the order of the text.
    Set<String> found = new java.util.LinkedHashSet<String>();
    if (txt == null) {
        return found;
    }
    int pos = 0;
    while (pos < txt.length()) {
        int length = CheckSensitiveWord(txt, pos, matchType);
        if (length > 0) {
            found.add(txt.substring(pos, pos + length));
            // Jump past the matched word.
            pos += length;
        } else {
            pos++;
        }
    }
    return found;
}
/**
 * Initialization callback (declared by the {@code InitializingBean} interface
 * this class implements): builds the province/city/district word map through
 * {@code SensitiveWordInit} and stores it in the static
 * {@code sensitiveWordMap} field.
 *
 * @throws Exception declared by the overridden method
 */
@Override
public void afterPropertiesSet() throws Exception {
    SensitiveWordInit initializer = new SensitiveWordInit();
    Map loaded = initializer.initKeyWord();
    sensitiveWordMap = loaded;
}
初始化代码
/**
 * Loads the province/city/district dictionary and turns it into the nested-map
 * (DFA / trie) structure used for matching.
 *
 * Structure of the map: every node is a {@code Map} whose {@code Character}
 * keys lead to child nodes, plus the {@code String} key {@code "isEnd"} whose
 * value is {@code "1"} when a dictionary word ends at that node and
 * {@code "0"} otherwise. The root map has only {@code Character} keys.
 *
 * @Author : lemon
 */
public class SensitiveWordInit {
    /** Character set of the dictionary file. */
    private static final String ENCODING = "UTF-8";

    /** Root of the most recently built word map; null until the first load. */
    public static HashMap sensitiveWordMap;

    public SensitiveWordInit(){
        super();
    }

    /**
     * Reads the dictionary and builds the word map.
     *
     * Loading stays best-effort: a failure is reported on stderr and not
     * rethrown. If no map has ever been built, an empty one is installed so
     * that callers receive a usable map (matching nothing) instead of null.
     *
     * @return the root of the word map, never {@code null}
     */
    public Map initKeyWord(){
        try {
            addSensitiveWordToHashMap(readSensitiveWordFile());
        } catch (Exception e) {
            // No logger is available in this class; keep the failure visible.
            e.printStackTrace();
        }
        if (sensitiveWordMap == null) {
            sensitiveWordMap = new HashMap();
        }
        return sensitiveWordMap;
    }

    /**
     * Builds the nested map from the given words and publishes it in
     * {@link #sensitiveWordMap} once it is complete.
     *
     * @param keyWordSet dictionary words; empty strings contribute nothing
     */
    @SuppressWarnings("unchecked")
    private void addSensitiveWordToHashMap(Set<String> keyWordSet) {
        HashMap<Object, Object> root = new HashMap<Object, Object>(keyWordSet.size());
        for (String word : keyWordSet) {
            Map<Object, Object> node = root;
            int lastIndex = word.length() - 1;
            for (int i = 0; i <= lastIndex; i++) {
                char keyChar = word.charAt(i);
                Object child = node.get(keyChar);
                if (child != null) {
                    // Shared prefix: walk into the existing node.
                    node = (Map<Object, Object>) child;
                } else {
                    Map<Object, Object> created = new HashMap<Object, Object>();
                    created.put("isEnd", "0");
                    node.put(keyChar, created);
                    node = created;
                }
                if (i == lastIndex) {
                    // A word ends here, even if longer words continue below.
                    node.put("isEnd", "1");
                }
            }
        }
        // Publish only the fully built map.
        sensitiveWordMap = root;
    }

    /**
     * Reads the dictionary, one word per line, from the classpath resource
     * {@code static/省市区.txt}.
     *
     * The resource is opened as a stream rather than as a {@code File}, so it
     * can also be read when it is packaged inside a jar. Lines are trimmed,
     * blank lines are skipped and a leading byte-order mark is removed.
     *
     * @return the distinct words of the dictionary, never {@code null}
     * @throws Exception if the resource is missing or cannot be read
     */
    private Set<String> readSensitiveWordFile() throws Exception{
        Set<String> words = new HashSet<String>();
        ClassPathResource resource = new ClassPathResource("static/省市区.txt");
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(resource.getInputStream(), ENCODING))) {
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // Some editors prepend a BOM to UTF-8 files; it would
                    // otherwise become part of the first word.
                    if (line.startsWith("\uFEFF")) {
                        line = line.substring(1);
                    }
                }
                line = line.trim();
                if (!line.isEmpty()) {
                    words.add(line);
                }
            }
        }
        return words;
    }
}
这里是对应的resource下的词库,结合自己的做修改
.
我的词库整理的可能不全,大家可以自己完善
要的可以私信我,发给你
实现类
/**
 * Splits a free-form contact blurb into phone number(s), merged
 * province/city/district, detailed address and sender name.
 *
 * Steps:
 * 1. Landline numbers ({@code 0xx-xxxxxxx}) are extracted first, while their
 *    hyphen is still present.
 * 2. All punctuation is removed, then mobile numbers are extracted. All
 *    numbers found are concatenated into the phone field.
 * 3. Region words found by the word filter are classified by their suffix as
 *    province, city or district; each word lands in exactly one category.
 * 4. The text after the most specific region word becomes the detailed
 *    address. What remains before it, minus the region words, becomes the
 *    sender.
 *
 * @param contactInformation raw text as pasted by the user; {@code null}
 *                           yields an empty result object
 * @return the populated result, never {@code null}
 */
@Override
public ExpressAdress getIntelligentIdentification(String contactInformation) {
    ExpressAdress expressAdress = new ExpressAdress();
    if (contactInformation == null) {
        return expressAdress;
    }

    StringBuilder phones = new StringBuilder(64);

    // Landlines must be matched before punctuation is stripped: the pattern
    // relies on the hyphen between area code and number.
    Matcher landline = Pattern.compile("0\\d{2,4}-\\d{7,8}").matcher(contactInformation);
    while (landline.find()) {
        phones.append(landline.group());
    }
    // Remove every match individually; the numbers need not be adjacent.
    contactInformation = landline.replaceAll("");

    contactInformation = contactInformation.replaceAll("[\\p{Punct}\\pP]","");

    // Mobiles are matched after stripping so that "138-0000-0000" still works.
    Matcher mobile = Pattern.compile("1[3-9]\\d{9}").matcher(contactInformation);
    while (mobile.find()) {
        phones.append(mobile.group());
    }
    contactInformation = mobile.replaceAll("");
    expressAdress.setPhone(phones.toString());

    Set<String> filterWord = sensitivewordFilter.getSensitiveWord(contactInformation, SensitivewordFilter.maxMatchType);
    String province = null;
    String city = null;
    String area = null;
    for (String word : filterWord) {
        // Suffix checks in a single else-if chain: "自治区" and "地区" both end
        // in "区" and must not be recorded as a district as well.
        if (word.endsWith("省") || word.endsWith("自治区") || word.endsWith("行政区")) {
            province = word;
        } else if (word.endsWith("市") || word.endsWith("州") || word.endsWith("地区")) {
            city = word;
        } else if (word.endsWith("区") || word.endsWith("县")) {
            area = word;
        }
    }

    // Merge whatever levels were recognised; missing levels are skipped.
    StringBuilder connectAddress = new StringBuilder();
    if (province != null) {
        connectAddress.append(province);
    }
    if (city != null) {
        connectAddress.append(city);
    }
    if (area != null) {
        connectAddress.append(area);
    }
    expressAdress.setConnectAddress(connectAddress.toString());

    // The detailed address starts right after the most specific region word.
    String anchor = area != null ? area : (city != null ? city : province);
    if (anchor != null) {
        int anchorStart = contactInformation.indexOf(anchor);
        if (anchorStart >= 0) {
            String address = contactInformation.substring(anchorStart + anchor.length());
            expressAdress.setAddress(address.trim());
            contactInformation = contactInformation.substring(0, anchorStart);
        }
    }

    // Whatever is left in front of the address, minus region words, is the name.
    if (province != null) {
        contactInformation = contactInformation.replace(province, "");
    }
    if (city != null) {
        contactInformation = contactInformation.replace(city, "");
    }
    expressAdress.setSender(contactInformation.trim());
    return expressAdress;
}
实体类
/**
 * Result of parsing a free-form shipping/contact text: who sent it, how to
 * reach them and where they are.
 */
public class ExpressAdress extends BaseEntity {
    private static final long serialVersionUID = 1L;

    /** Name of the user (sender). */
    private String sender;
    /** Phone number. */
    private String phone;
    /** Province. */
    private String province;
    /** City. */
    private String city;
    /** District. */
    private String area;
    /** Detailed address. */
    private String address;
    /** Province, city and district merged into one string. */
    private String connectAddress;

    /** @return the sender name */
    public String getSender() {
        return this.sender;
    }

    /** @param sender the sender name */
    public void setSender(String sender) {
        this.sender = sender;
    }

    /** @return the phone number */
    public String getPhone() {
        return this.phone;
    }

    /** @param phone the phone number */
    public void setPhone(String phone) {
        this.phone = phone;
    }

    /** @return the province */
    public String getProvince() {
        return this.province;
    }

    /** @param province the province */
    public void setProvince(String province) {
        this.province = province;
    }

    /** @return the city */
    public String getCity() {
        return this.city;
    }

    /** @param city the city */
    public void setCity(String city) {
        this.city = city;
    }

    /** @return the district */
    public String getArea() {
        return this.area;
    }

    /** @param area the district */
    public void setArea(String area) {
        this.area = area;
    }

    /** @return the detailed address */
    public String getAddress() {
        return this.address;
    }

    /** @param address the detailed address */
    public void setAddress(String address) {
        this.address = address;
    }

    /** @return province, city and district merged into one string */
    public String getConnectAddress() {
        return this.connectAddress;
    }

    /** @param connectAddress province, city and district merged into one string */
    public void setConnectAddress(String connectAddress) {
        this.connectAddress = connectAddress;
    }
}
基本都可以识别,有不足,大家可以自己完善或者提出来,一起探讨