基于字典树的AC自动机:实现内容关键词检索

 如何使用

private ACTrie ctgACTrie = new ACTrie(); // create the automaton

ctgACTrie.addKeyword("你好"); // register a keyword

private String passThoughCTG(String name) {
  Collection<MatchInfo> emits = ctgACTrie.search(name);
  if (emits.isEmpty())
    return null;
  MatchInfo m = null;
  for (MatchInfo emit : emits) {
    if (m == null || emit.getEnd()-emit.getStart() > m.getEnd()-m.getStart()) {
      m = emit;
    }
  }
  return m.getKeyword().toString()
}


/* returns "你好" */
passThoughCTG("啊啊啊你好啊啊啊"); // check whether the sentence contains a keyword

创建自动机类

package com.xdf.udf.util;

import java.util.*;
import lombok.Data;

/**
 * Aho-Corasick automaton built on a trie; finds every occurrence of a set of keywords in a text
 * in a single pass over that text.
 *
 * <p>Keywords may be added or deleted at any time. Every modification invalidates the failure
 * table, which is rebuilt lazily by the next {@link #search(String)} or
 * {@link #findAnyIn(String)} call. Instances are not thread-safe: even the query methods may
 * mutate internal state while rebuilding the table.
 *
 * <p>Originally written 2020/5/20.
 *
 * @author DFT
 * @version V1.0
 * @see "https://blog.youkuaiyun.com/qq_44011386/article/details/117958782"
 */
public class ACTrie {
  /** True while the failure links and merged outputs reflect the current keyword set. */
  private boolean failureBuilt = false;
  /** Root of the trie; stands for the empty prefix. */
  private final Node root;

  /** Creates an automaton with no keywords. */
  public ACTrie() {
    this.root = new Node(true);
  }

  /**
   * Adds a group of keywords.
   *
   * @param sequences keywords to add; {@code null} or empty collections are ignored, as are
   *     {@code null} or empty elements
   */
  public void addKeywordList(Collection<? extends CharSequence> sequences) {
    if (sequences == null || sequences.isEmpty()) {
      return;
    }
    for (CharSequence sequence : sequences) {
      addKeyword(sequence);
    }
  }

  /**
   * Adds a single keyword. Adding a keyword that is already present has no further effect.
   *
   * @param cs keyword to add; {@code null} or empty input is ignored
   */
  public void addKeyword(CharSequence cs) {
    if (cs == null || cs.length() == 0) {
      return;
    }
    // Snapshot the content so a mutable CharSequence cannot change under the trie later.
    String keyword = cs.toString();
    Node currentState = this.root;
    int len = keyword.length();
    for (int i = 0; i < len; i++) {
      currentState = currentState.insert(keyword.charAt(i));
    }
    currentState.setKeyword(keyword);
    // New nodes have no failure link yet; force a rebuild before the next query.
    this.failureBuilt = false;
  }

  /**
   * Deletes a single keyword. Nodes that no longer lead to any keyword are pruned; nodes shared
   * with other keywords are kept.
   *
   * @param cs keyword to delete; {@code null}, empty, or unknown keywords are ignored
   */
  public void deleteKeyword(CharSequence cs) {
    if (cs == null || cs.length() == 0) {
      return;
    }
    String keyword = cs.toString();
    int len = keyword.length();
    // path[i] is the node reached after consuming the first i characters.
    Node[] path = new Node[len + 1];
    path[0] = this.root;
    for (int i = 0; i < len; i++) {
      Node next = path[i].childAt(keyword.charAt(i));
      if (next == null) {
        return;
      }
      path[i + 1] = next;
    }
    Node last = path[len];
    if (!last.isKeyword()) {
      // The path exists only as a prefix of other keywords; nothing to delete.
      return;
    }
    last.setKeyword(null);
    // Walk back towards the root, dropping nodes that are now dead ends.
    for (int i = len; i > 0; i--) {
      Node node = path[i];
      if (node.isKeyword() || node.hasChildren()) {
        break;
      }
      path[i - 1].removeChild(keyword.charAt(i - 1));
    }
    // Failure links and merged outputs may still reference the deleted keyword.
    this.failureBuilt = false;
  }

  /**
   * Finds every keyword occurrence in the text.
   *
   * @param text text to scan; {@code null} is treated like an empty text
   * @return matches ordered by end position; matches ending at the same position are ordered
   *     from longest to shortest keyword. Never {@code null}.
   */
  public Collection<MatchInfo> search(String text) {
    List<MatchInfo> matchInfos = new ArrayList<MatchInfo>();
    if (text == null || text.isEmpty()) {
      return matchInfos;
    }
    if (!this.failureBuilt) {
      setFailNode();
    }
    Node currentState = this.root;
    int len = text.length();
    for (int position = 0; position < len; position++) {
      currentState = currentState.nextNode(text.charAt(position));
      for (CharSequence emit : currentState.emit()) {
        matchInfos.add(new MatchInfo(position - emit.length() + 1, position, emit));
      }
    }
    return matchInfos;
  }

  /**
   * Tells whether the text contains at least one keyword.
   *
   * @param text text to scan; {@code null} is treated like an empty text
   * @return {@code true} as soon as any keyword is found, {@code false} otherwise
   */
  public boolean findAnyIn(String text) {
    if (text == null || text.isEmpty()) {
      return false;
    }
    if (!this.failureBuilt) {
      setFailNode();
    }
    Node currentState = this.root;
    int len = text.length();
    for (int position = 0; position < len; position++) {
      currentState = currentState.nextNode(text.charAt(position));
      if (!currentState.emit().isEmpty()) {
        return true;
      }
    }
    return false;
  }

  /**
   * Rebuilds the failure link and the merged output list of every node with a breadth-first
   * traversal. Safe to call repeatedly: each node's output is recomputed from scratch.
   */
  private void setFailNode() {
    Queue<Node> queue = new ArrayDeque<Node>();
    // Depth-1 nodes always fall back to the root.
    for (Node rootChild : this.root.children()) {
      rootChild.linkFailure(this.root);
      queue.add(rootChild);
    }
    // BFS guarantees that a node's failure target (always shallower) is finished first.
    while (!queue.isEmpty()) {
      Node parentNode = queue.poll();
      for (Node child : parentNode.children()) {
        // Failure node = transition of the parent's failure node on the child's character.
        Node failNode = parentNode.getFailure().nextNode(child.value);
        child.linkFailure(failNode);
        queue.add(child);
      }
    }
    this.failureBuilt = true;
  }

  /** A trie node, i.e. one state of the automaton. */
  private static class Node {
    private static final char EMPTY = '\0';
    /** Whether this node is the root. */
    private final boolean isRoot;
    /** Child nodes keyed by the character that leads to them. */
    private final Map<Character, Node> map = new HashMap<Character, Node>();
    /** Character on the edge leading to this node. */
    private final char value;
    /** Failure node; set by {@link #linkFailure(Node)}. */
    private Node failure;
    /** Keyword ending exactly at this node, or {@code null}. */
    private CharSequence keyword;
    /** This node's keyword followed by those inherited along the failure chain. */
    private List<CharSequence> emits = Collections.<CharSequence>emptyList();

    /** Creates a non-root node for the given character. */
    Node(char value) {
      this.value = value;
      this.isRoot = false;
    }

    /** Creates a node with the placeholder character, optionally marked as root. */
    Node(boolean isRoot) {
      this.value = EMPTY;
      this.isRoot = isRoot;
    }

    /**
     * Returns the child for the character, creating it when absent.
     *
     * @param character edge character
     * @return the existing or newly created child
     */
    Node insert(char character) {
      Node node = this.map.get(character);
      if (node == null) {
        node = new Node(character);
        this.map.put(character, node);
      }
      return node;
    }

    /** Returns the child for the character, or {@code null} when there is none. */
    Node childAt(char character) {
      return this.map.get(character);
    }

    /** Removes the child for the character, if any. */
    void removeChild(char character) {
      this.map.remove(character);
    }

    /**
     * Follows the automaton transition for the character: the matching child of this node or of
     * the nearest node on the failure chain that has one, otherwise the root.
     */
    Node nextNode(char c) {
      Node state = this;
      while (true) {
        Node next = state.childAt(c);
        if (next != null) {
          return next;
        }
        if (state.isRoot) {
          return state;
        }
        state = state.failure;
      }
    }

    /** Sets or clears ({@code null}) the keyword ending at this node. */
    void setKeyword(CharSequence keyword) {
      this.keyword = keyword;
    }

    boolean isKeyword() {
      return this.keyword != null;
    }

    boolean hasChildren() {
      return !this.map.isEmpty();
    }

    Collection<Node> children() {
      return this.map.values();
    }

    /**
     * Sets the failure node and recomputes the output list as this node's keyword followed by
     * the failure node's output. The failure node's output must already be up to date.
     */
    void linkFailure(Node node) {
      this.failure = node;
      if (this.keyword == null) {
        // Output lists are never mutated after being built, so sharing is safe.
        this.emits = node.emits;
        return;
      }
      List<CharSequence> merged = new ArrayList<CharSequence>(node.emits.size() + 1);
      merged.add(this.keyword);
      merged.addAll(node.emits);
      this.emits = merged;
    }

    Node getFailure() {
      return this.failure;
    }

    /** Keywords that end at the position represented by this node; never {@code null}. */
    Collection<CharSequence> emit() {
      return this.emits;
    }
  }

  /** One keyword occurrence; {@code start} and {@code end} are inclusive text indices. */
  public static class MatchInfo {
    private final CharSequence keyword; // the matched keyword
    private final int start;
    private final int end;

    /**
     * @param start index of the first matched character
     * @param end index of the last matched character
     * @param keyword the matched keyword
     */
    public MatchInfo(final int start, final int end, final CharSequence keyword) {
      this.start = start;
      this.end = end;
      this.keyword = keyword;
    }

    public CharSequence getKeyword() {
      return this.keyword;
    }

    public int getStart() {
      return this.start;
    }

    public int getEnd() {
      return this.end;
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (!(o instanceof MatchInfo)) {
        return false;
      }
      MatchInfo other = (MatchInfo) o;
      if (this.start != other.start || this.end != other.end) {
        return false;
      }
      return this.keyword == null ? other.keyword == null : this.keyword.equals(other.keyword);
    }

    @Override
    public int hashCode() {
      int result = this.keyword == null ? 0 : this.keyword.hashCode();
      result = 31 * result + this.start;
      result = 31 * result + this.end;
      return result;
    }

    @Override
    public String toString() {
      return "ACTrie.MatchInfo(keyword=" + this.keyword
          + ", start=" + this.start
          + ", end=" + this.end + ")";
    }
  }

  // demo
  public static void main(String[] args) {
    List<String> keywords = Arrays.asList("coxquery#@{","coxquery#@config#@{","coxqueryhealth#@{","agent#@{","agent#@config#@{","agenthealth#@{");
    ACTrie trie = new ACTrie();
    trie.addKeyword("ctg#@{");
    trie.addKeyword("ctg#@config#@{");
    trie.addKeyword("ctghealth#@{");
    trie.addKeyword("cox#@{");
    trie.addKeyword("cox#@config#@{");
    trie.addKeyword("coxhealth#@{");
    trie.addKeywordList(keywords);

    trie.deleteKeyword("coxhealth#@{");
    System.out.println(trie.findAnyIn("#@monitor#@dataquery#@coxhealth#@{"));
    System.out.println(trie.findAnyIn("#@monitor#@dataquery#@cox#@config#@{"));
    Collection<MatchInfo> emits = trie.search("#@monitor#@dataquery#@coxquery#@config#@{");
    for (MatchInfo emit : emits) {
      System.out.println(emit);
    }
  }

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值