首页 > 开发 > Java > 正文

Java实现AC自动机全文检索示例

2024-07-13 10:03:45
字体:
来源:转载
供稿:网友

第一步,构建Trie树,定义Node类型:

/** * Created by zhaoyy on 2017/2/7. */interface Node {  char value();  boolean exists();  boolean isRoot();  Node parent();  Node childOf(char c);  Node fail();  void setFail(Node node);  void setExists(boolean exists);  void add(Node child);  List<Node> children();}

第二步,实现两种Node,如果词汇全是可打印的ASCII字符,就采用AsciiNode,否则(比如包含汉字),使用基于hash表的MapNode;这两种Node均集成自AbstractNode:

/** * Created by zhaoyy on 2017/2/8. */abstract class AbstractNode implements Node {  private static final char EMPTY = '/0';  private final char value;  private final Node parent;  private boolean exists;  private Node fail;  public AbstractNode(Node parent, char value) {    this.parent = parent;    this.value = value;    this.exists = false;    this.fail = null;  }  public AbstractNode() {    this(null, EMPTY);  }  private static String tab(int n) {    StringBuilder builder = new StringBuilder();    for (int i = 0; i < n; i++) {      builder.append('/t');    }    return builder.toString();  }  private static String toString(Node node, int depth) {    StringBuilder builder = new StringBuilder();    String tab = tab(depth);    Node fail = node.fail();    Node parent = node.parent();    builder        .append(tab)        .append('<')        .append(node.value())        .append(" exists=/"")        .append(node.exists())        .append('"')        .append(" parent=/"")        .append(parent == null ? "null" : parent.value())        .append('"')        .append(" fail=/"")        .append(fail == null ? "null" : fail.value())        .append("/">/r/n");    for (Node child : node.children())      builder.append(toString(child, depth + 1));    builder        .append(tab)        .append("</")        .append(node.value())        .append(">/r/n");    return builder.toString();  }  @Override  public char value() {    return value;  }  @Override  public boolean exists() {    return exists;  }  @Override  public boolean isRoot() {    return value == EMPTY;  }  @Override  public Node parent() {    return parent;  }  @Override  public Node fail() {    return fail;  }  @Override  public void setFail(Node node) {    this.fail = node;  }  @Override  public void setExists(boolean exists) {    this.exists = exists;  }  @Override  public String toString() {    return toString(this, 0);  }}//////////////////////////////////////////////////////////////////////////////////////////////** * Created by zhaoyy on 2017/2/8. */final class AsciiNode extends AbstractNode implements Node {  private static final char FROM = 32;  private static final char TO = 126;  private final Node[] children;  public AsciiNode() {    super();    this.children = new Node[TO - FROM + 1];  }  public AsciiNode(Node parent, char value) {    super(parent, value);    this.children = new Node[TO - FROM + 1];  }  @Override  public Node childOf(char c) {    if (c >= FROM && c <= TO)      return children[(int) c - FROM];    else return null;  }  @Override  public void add(Node child) {    int index = (int) child.value();    children[index - FROM] = child;  }  @Override  public List<Node> children() {    List<Node> nodes = new ArrayList<Node>();    for (Node child : children)      if (child != null)        nodes.add(child);    return nodes;  }}///////////////////////////////////////////////////////////////////////////////////////////////** * Created by zhaoyy on 2017/2/8. */final class MapNode extends AbstractNode implements Node {  private final Map<Character, Node> children;  public MapNode() {    super();    this.children = new HashMap<Character, Node>();  }  public MapNode(Node parent, char value) {    super(parent, value);    this.children = new HashMap<Character, Node>();  }  @Override  public Node childOf(char c) {    return children.get(c);  }  @Override  public void add(Node child) {    children.put(child.value(), child);  }  @Override  public List<Node> children() {    List<Node> nodes = new ArrayList<Node>();    nodes.addAll(children.values());    return nodes;  }}

第三步,

首先定义一个Node构造器:

/** * Created by zhaoyy on 2017/2/8. */public interface NodeMaker {  Node make(Node parent, char value);  Node makeRoot();}

然后构建AC自动机,实现创建及查找方法:

/** * Created by zhaoyy on 2017/2/7. */public final class WordTable {  private final Node root;  private WordTable(Collection<? extends CharSequence> words, NodeMaker maker) {    Node root = buildTrie(words, maker);    setFailNode(root);    this.root = root;  }  public static WordTable compile(Collection<? extends CharSequence> words) {    if (words == null || words.isEmpty())      throw new IllegalArgumentException();    final NodeMaker maker;    if (isAscii(words))      maker = new NodeMaker() {        @Override        public Node make(Node parent, char value) {          return new AsciiNode(parent, value);        }        @Override        public Node makeRoot() {          return new AsciiNode();        }      };    else maker = new NodeMaker() {      @Override      public Node make(Node parent, char value) {        return new MapNode(parent, value);      }      @Override      public Node makeRoot() {        return new MapNode();      }    };    return new WordTable(words, maker);  }  private static boolean isAscii(Collection<? extends CharSequence> words) {    for (CharSequence word : words) {      int len = word.length();      for (int i = 0; i < len; i++) {        int c = (int) word.charAt(i);        if (c < 32 || c > 126)          return false;      }    }    return true;  }  private static Node buildTrie(Collection<? extends CharSequence> sequences, NodeMaker maker) {    Node root = maker.makeRoot();    for (CharSequence sequence : sequences) {      int len = sequence.length();      Node current = root;      for (int i = 0; i < len; i++) {        char c = sequence.charAt(i);        Node node = current.childOf(c);        if (node == null) {          node = maker.make(current, c);          current.add(node);        }        current = node;        if (i == len - 1)          node.setExists(true);      }    }    return root;  }  private static void setFailNode(final Node root) {    root.setFail(null);    Queue<Node> queue = new LinkedList<Node>();    queue.add(root);    while (!queue.isEmpty()) {      Node parent = queue.poll();      Node temp;      for (Node child : parent.children()) {        if (parent.isRoot())          child.setFail(root);        else {          temp = parent.fail();          while (temp != null) {            Node node = temp.childOf(child.value());            if (node != null) {              child.setFail(node);              break;            }            temp = temp.fail();          }          if (temp == null)            child.setFail(root);        }        queue.add(child);      }    }  }  public boolean findAnyIn(CharSequence cs) {    int len = cs.length();    Node node = root;    for (int i = 0; i < len; i++) {      Node next = node.childOf(cs.charAt(i));      if (next == null) {        next = node.fail();        if (next == null) {          node = root;          continue;        }      }      if (next.exists())        return true;    }    return false;  }  public List<MatchInfo> search(CharSequence cs) {    if (cs == null || cs.length() == 0)      return Collections.emptyList();    List<MatchInfo> result = new ArrayList<MatchInfo>();    int len = cs.length();    Node node = root;    for (int i = 0; i < len; i++) {      Node next = node.childOf(cs.charAt(i));      if (next == null) {        next = node.fail();        if (next == null) {          node = root;          continue;        }      }      if (next.exists()) {        MatchInfo info = new MatchInfo(i, next);        result.add(info);        node = root;        continue;      }      node = next;    }    return result;  }  @Override  public String toString() {    return root.toString();  }}

定义一个保存查找结果的实体:

/** * Created by zhaoyy on 2017/2/7. */public final class MatchInfo {  private final int index;  private final String word;  public MatchInfo(int index, String word) {    this.index = index;    this.word = word;  }  public MatchInfo(int index, Node node) {    StringBuilder builder = new StringBuilder();    while (node != null) {      if (!node.isRoot())        builder.append(node.value());      node = node.parent();    }    String word = builder.reverse().toString();    this.index = index + 1 - word.length();    this.word = word;  }  public int getIndex() {    return index;  }  public String getWord() {    return word;  }  @Override  public String toString() {    return index + ":" + word;  }}

第四步,调用Demo:

public static void main(String[] args) {    List<String> list = Arrays.asList("say", "her", "he", "she", "shr", "alone");    WordTable table = WordTable.compile(list);    System.out.println(table);    System.out.println(table.search("1shesaynothingabouthislivinghimalone"));  }

以下是输出结果:

< exists="false" parent="null" fail="null"> <s exists="false" parent=" " fail=" "> <a exists="false" parent="s" fail="a">  <y exists="true" parent="a" fail=" ">  </y> </a> <h exists="false" parent="s" fail="h">  <e exists="true" parent="h" fail="e">  </e>  <r exists="true" parent="h" fail=" ">  </r> </h> </s> <h exists="false" parent=" " fail=" "> <e exists="true" parent="h" fail=" ">  <r exists="true" parent="e" fail=" ">  </r> </e> </h> <a exists="false" parent=" " fail=" "> <l exists="false" parent="a" fail=" ">  <o exists="false" parent="l" fail=" ">  <n exists="false" parent="o" fail=" ">   <e exists="true" parent="n" fail=" ">   </e>  </n>  </o> </l> </a></ >[1:she, 4:say, 31:alone]

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持VeVb武林网。


注:相关教程知识阅读请移步到JAVA教程频道。
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表