Java: IKAnalyzer segmentation using only a custom dictionary

Required jars:
IKAnalyzer2012_FF_hf1.jar
lucene-core-5.5.4.jar
Required files:

IKAnalyzer.cfg.xml

ext.dic

stopword.dic

Download link for the packaged jars and files:
http://download.csdn.net/detail/talkwah/9770635
FenCi.java:

import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class FenCi {

    private static Configuration m_wordCut_cfg;

    public static void main(String[] args) throws IOException {
        String s = "这节课我们讲授c语言里的结构体和宏";
        wordCut(s);
    }

    public static void wordCut(String query) throws IOException {
        m_wordCut_cfg = DefaultConfig.getInstance();
        System.out.println(m_wordCut_cfg.getMainDictionary());      // built-in main dictionary
        System.out.println(m_wordCut_cfg.getQuantifierDicionary()); // built-in quantifier dictionary

        StringReader input = new StringReader(query.trim());
        // true = smart segmentation, false = fine-grained segmentation
        IKSegmenter ikSeg = new IKSegmenter(input, true);

        Lexeme lexeme = ikSeg.next();
        for (; lexeme != null; lexeme = ikSeg.next()) {
            // Disable the default dictionary and use only the custom one:
            // 1. register the default dictionary as a stopword dictionary (see IKAnalyzer.cfg.xml)
            // 2. skip lexemes whose getLexemeType() is 64 (single characters not matched by any dictionary)
            int nType = lexeme.getLexemeType();
            if (nType == 64) {
                continue;
            }
            System.out.print(lexeme.getLexemeText() + " ");
        }
    }
}
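If the tokens are needed elsewhere instead of just printed, the same loop can be wrapped into a helper that returns a list. The sketch below only reuses the IKSegmenter/Lexeme calls from the code above; the class and method names (FenCiUtil, segment) are invented for illustration:

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class FenCiUtil {

    // Segments the query and keeps only the tokens that survive the
    // "custom dictionary only" filtering used above.
    public static List<String> segment(String query) throws IOException {
        List<String> tokens = new ArrayList<String>();
        IKSegmenter ikSeg = new IKSegmenter(new StringReader(query.trim()), true);
        for (Lexeme lexeme = ikSeg.next(); lexeme != null; lexeme = ikSeg.next()) {
            if (lexeme.getLexemeType() == 64) {
                continue; // single characters not matched by any dictionary
            }
            tokens.add(lexeme.getLexemeText());
        }
        return tokens;
    }

    public static void main(String[] args) throws IOException {
        // With the configuration shown below this should print [c语言, 结构体, 宏]
        System.out.println(segment("这节课我们讲授c语言里的结构体和宏"));
    }
}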

ext.dic (placed in the src folder in this example; a runtime alternative is sketched right after the listing)

# the first line of this file is ignored (upper and lower case both match)
C语言
结构体
宏
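If editing ext.dic is inconvenient, extension words can also be added at runtime. This is only a sketch: it assumes org.wltea.analyzer.dic.Dictionary in this IK build exposes initial(Configuration), getSingleton() and addWords(Collection), as the common 2012_FF releases do, and the class name DictLoader is invented:

import java.util.Arrays;

import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.dic.Dictionary;

public class DictLoader {

    public static void loadCustomWords() {
        // Initialize the dictionary singleton from IKAnalyzer.cfg.xml,
        // then add extension words programmatically.
        Dictionary.initial(DefaultConfig.getInstance());
        Dictionary.getSingleton().addWords(Arrays.asList("C语言", "结构体", "宏"));
    }
}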

IKAnalyzer.cfg.xml (must be placed in the src folder). The trick is to register the built-in main2012.dic as an extension stopword dictionary, so words matched only by the default dictionary are dropped from the output; a small classpath check follows the file.

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- users can configure their own extension dictionaries here -->
    <entry key="ext_dict">ext.dic;</entry>

    <!-- users can configure their own extension stopword dictionaries here -->
    <entry key="ext_stopwords">stopword.dic;org/wltea/analyzer/dic/main2012.dic;</entry>
</properties>
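To check that IKAnalyzer.cfg.xml is really picked up from the classpath, the configured dictionary paths can be printed before segmenting. A small sketch, assuming Configuration in this version exposes getExtDictionarys() and getExtStopWordDictionarys() as in the 2012 sources; the class name CfgCheck is invented:

import java.util.List;

import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;

public class CfgCheck {

    public static void main(String[] args) {
        Configuration cfg = DefaultConfig.getInstance();
        // Should contain ext.dic if the cfg file was found
        List<String> extDics = cfg.getExtDictionarys();
        // Should contain stopword.dic and org/wltea/analyzer/dic/main2012.dic
        List<String> extStops = cfg.getExtStopWordDictionarys();
        System.out.println("ext_dict = " + extDics);
        System.out.println("ext_stopwords = " + extStops);
    }
}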

stopword.dic (left unchanged)

Result:

org/wltea/analyzer/dic/main2012.dic
org/wltea/analyzer/dic/quantifier.dic
Loading extension dictionary: ext.dic
Loading extension stopword dictionary: stopword.dic
Loading extension stopword dictionary: org/wltea/analyzer/dic/main2012.dic
c语言 结构体 宏