// Chinese word segmentation code. It distills the author's many years of experience; VB and PB versions were published previously.

/*

* created by yzh 2004.5.12

* 请大家引用时保留这段作者声明,此代码为开源代码;使用不受限制。

* 中文分词代码

*此代码为作者多年经验总结,以前发表过VB,PB版本

*/

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.util.Locale;

import java.util.TreeMap;

import java.util.TreeSet;

/**
 * Lexicon-driven segmenter that inserts a separator between the words it
 * recognises in a line of Chinese text.
 *
 * <p>Instances are obtained through {@link #getGBSegmenter()} (Simplified
 * character data) or {@link #getBig5Segmenter()} (Traditional character
 * data). Each factory caches its own instance until {@link #reset()} is
 * called.
 *
 * <p>All data files are read as UTF-8 class-path resources relative to this
 * class. A resource that cannot be found or read is reported on
 * {@code System.err} and skipped, so the segmenter then works with whatever
 * data could be loaded.
 */
public class ChineseSegmenter {

    // Character forms accepted by the constructor.

    /** Traditional characters. */
    public final static int TRAD = 0;

    /** Simplified characters. */
    public final static int SIMP = 1;

    /** Both Traditional and Simplified characters. */
    public final static int BOTH = 2;

    /** Lexicon value marking an entry that is a complete word. */
    private static final String WORD = "1";

    /** Lexicon value marking an entry that is only the beginning of a longer word. */
    private static final String PREFIX = "2";

    // One cached instance per character form, so that asking for one form
    // never hands back a segmenter that was built for the other.
    private static ChineseSegmenter gbSegmenter = null;

    private static ChineseSegmenter big5Segmenter = null;

    /** Maps a lexicon entry to {@link #WORD} or {@link #PREFIX}. */
    private TreeMap zhwords;

    /** Single-character strings read from the "foreign" and "numbers" data files. */
    private TreeSet cforeign, cnumbers;

    /**
     * Builds a segmenter for the given character form.
     *
     * @param charform     {@link #TRAD}, {@link #SIMP} or {@link #BOTH}; any
     *                     other value loads both character sets, as BOTH does
     * @param loadwordfile whether to load the word lexicon; when false the
     *                     lexicon stays empty
     */
    private ChineseSegmenter(int charform, boolean loadwordfile) {
        cforeign = new TreeSet();
        cnumbers = new TreeSet();

        if (charform != TRAD) { // SIMP, or BOTH
            loadset(cnumbers, "data/snumbers_u8.txt");
            loadset(cforeign, "data/sforeign_u8.txt");
        }
        if (charform != SIMP) { // TRAD, or BOTH
            loadset(cnumbers, "data/tnumbers_u8.txt");
            loadset(cforeign, "data/tforeign_u8.txt");
        }

        zhwords = new TreeMap();
        if (loadwordfile) {
            loadLexicon(lexiconFor(charform));
        }
    }

    /** Returns the lexicon resource name for a character form, or null if the form is unknown. */
    private static String lexiconFor(int charform) {
        switch (charform) {
        case SIMP:
            return "simplexu8.txt";
        case TRAD:
            return "tradlexu8.txt";
        case BOTH:
            return "bothlexu8.txt";
        default:
            return null;
        }
    }

    /**
     * Reads the word lexicon into {@link #zhwords}.
     *
     * <p>Lines containing {@code #}, blank lines and entries of five or more
     * characters are ignored. Every accepted entry is stored as a word; for
     * three- and four-character entries each leading part of two or more
     * characters is additionally stored as a prefix unless it is already
     * present.
     *
     * @param sourcefile resource name relative to this class; may be null
     */
    private void loadLexicon(String sourcefile) {
        InputStream worddata = (sourcefile == null) ? null
                : getClass().getResourceAsStream(sourcefile);
        if (worddata == null) {
            // getResourceAsStream returns null for a missing resource; handing
            // that to InputStreamReader would throw a NullPointerException.
            System.err.println("Lexicon file not found: " + sourcefile);
            return;
        }
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(
                    worddata, "UTF-8"));
            String newword;
            while ((newword = in.readLine()) != null) {
                int length = newword.length();
                if (length == 0 || length >= 5 || newword.indexOf('#') != -1) {
                    continue;
                }
                zhwords.put(newword, WORD);
                for (int end = 2; end < length; end++) {
                    String prefix = newword.substring(0, end);
                    if (!zhwords.containsKey(prefix)) {
                        zhwords.put(prefix, PREFIX);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(worddata);
        }
    }

    /** Closes a stream, ignoring any failure: nothing useful can be done about it here. */
    private static void closeQuietly(InputStream stream) {
        try {
            stream.close();
        } catch (IOException ignored) {
            // best effort
        }
    }

    /** Discards the cached segmenters; the next factory call builds a fresh instance. */
    public synchronized static void reset() {
        gbSegmenter = null;
        big5Segmenter = null;
    }

    /**
     * Returns the cached Simplified-character segmenter, creating it on first use.
     *
     * <p>Side effect: every call sets the JVM default locale to
     * {@link Locale#SIMPLIFIED_CHINESE}.
     */
    public synchronized static ChineseSegmenter getGBSegmenter() {
        Locale.setDefault(Locale.SIMPLIFIED_CHINESE);
        if (gbSegmenter == null) {
            gbSegmenter = new ChineseSegmenter(SIMP, true);
        }
        return gbSegmenter;
    }

    /**
     * Returns the cached Traditional-character segmenter, creating it on first use.
     *
     * <p>Side effect: every call sets the JVM default locale to
     * {@link Locale#TRADITIONAL_CHINESE}.
     */
    public synchronized static ChineseSegmenter getBig5Segmenter() {
        Locale.setDefault(Locale.TRADITIONAL_CHINESE);
        if (big5Segmenter == null) {
            big5Segmenter = new ChineseSegmenter(TRAD, true);
        }
        return big5Segmenter;
    }

    /**
     * Adds every usable line of a UTF-8 resource to a set.
     *
     * <p>Lines containing {@code #} and empty lines are skipped.
     *
     * @param targetset  set that receives the lines
     * @param sourcefile resource name relative to this class
     */
    private void loadset(TreeSet targetset, String sourcefile) {
        InputStream setdata = getClass().getResourceAsStream(sourcefile);
        if (setdata == null) {
            System.err.println("Data file not found: " + sourcefile);
            return;
        }
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(
                    setdata, "UTF-8"));
            String dataline;
            while ((dataline = in.readLine()) != null) {
                if (dataline.length() == 0 || dataline.indexOf('#') > -1) {
                    continue;
                }
                targetset.add(dataline);
            }
        } catch (IOException e) {
            System.err.println("Exception loading data file " + sourcefile
                    + " " + e);
            e.printStackTrace();
        } finally {
            closeQuietly(setdata);
        }
    }

    /** Returns true when every single-character substring of text is a member of charset. */
    private static boolean consistsOf(TreeSet charset, String text) {
        for (int i = 0; i < text.length(); i++) {
            if (!charset.contains(text.substring(i, i + 1))) {
                return false;
            }
        }
        return true;
    }

    /** Returns true when c lies in the CJK Unified Ideographs Unicode block. */
    private static boolean isCJK(char c) {
        return Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
    }

    /**
     * Tests whether every character of a string is in the loaded number set.
     *
     * @param testword string to test
     * @return true if all characters are number characters; true for an empty string
     */
    public boolean isNumber(String testword) {
        return consistsOf(cnumbers, testword);
    }

    /**
     * Tests whether every character of a string is in the loaded foreign-character set.
     *
     * @param testword string to test
     * @return true if all characters are foreign characters; true for an empty string
     */
    public boolean isAllForeign(String testword) {
        return consistsOf(cforeign, testword);
    }

    /**
     * Tests whether a string is free of characters from the CJK Unified
     * Ideographs block.
     *
     * @param testword string to test
     * @return false as soon as one such character is found, true otherwise
     */
    public boolean isNotCJK(String testword) {
        for (int i = 0; i < testword.length(); i++) {
            if (isCJK(testword.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Segments one line of text.
     *
     * <p>Characters that are neither CJK ideographs nor number characters are
     * copied through unchanged. Runs of the other characters are grouped into
     * words greedily, and the separator is written at each word boundary
     * unless the neighbouring character is whitespace.
     *
     * @param cline     line to segment
     * @param separator text inserted between words
     * @return the line with separators inserted
     */
    public String segmentLine(String cline, String separator) {
        StringBuffer currentword = new StringBuffer();
        StringBuffer outline = new StringBuffer();
        int clength = cline.length();

        for (int i = 0; i < clength; i++) {
            char currentchar = cline.charAt(i);
            boolean segmentable = isCJK(currentchar)
                    || isNumber(cline.substring(i, i + 1));

            if (!segmentable) {
                // Flush any pending word, then copy the character through.
                if (currentword.length() > 0) {
                    outline.append(currentword.toString());
                    if (!Character.isWhitespace(currentchar)) {
                        outline.append(separator);
                    }
                    currentword.setLength(0);
                }
                outline.append(currentchar);
            } else if (currentword.length() == 0) {
                // First character of a new word.
                if (i > 0 && !Character.isWhitespace(cline.charAt(i - 1))) {
                    outline.append(separator);
                }
                currentword.append(currentchar);
            } else if (extendsWord(currentword.toString(), cline, i)) {
                currentword.append(currentchar);
            } else {
                // The pending word ends here; start anew with this character.
                outline.append(currentword.toString());
                if (!Character.isWhitespace(currentchar)) {
                    outline.append(separator);
                }
                currentword.setLength(0);
                currentword.append(currentchar);
            }
        }

        outline.append(currentword.toString());
        return outline.toString();
    }

    /**
     * Decides whether the character at index i of cline continues the word
     * collected so far.
     *
     * @param current the non-empty word collected so far
     * @param cline   the line being segmented
     * @param i       index of the character under consideration
     */
    private boolean extendsWord(String current, String cline, int i) {
        int clength = cline.length();
        char currentchar = cline.charAt(i);
        String candidate = current + currentchar;
        Object status = zhwords.get(candidate);

        // The extended word is itself in the lexicon.
        if (WORD.equals(status)) {
            return true;
        }

        String single = String.valueOf(currentchar);

        // Possibly a transliteration of a foreign name.
        // NOTE(review): substring(i, i + 2) only needs i + 1 < clength; the
        // stricter bound keeps the original behaviour - confirm it is intended.
        if (isAllForeign(current) && cforeign.contains(single)
                && i + 2 < clength
                && !zhwords.containsKey(cline.substring(i, i + 2))) {
            return true;
        }

        // Keep all consecutive number characters together.
        if (isNumber(current) && cnumbers.contains(single)) {
            return true;
        }

        // The extended text only begins a lexicon entry: accept it when adding
        // the following character yields another lexicon entry.
        return PREFIX.equals(status) && i + 1 < clength
                && zhwords.containsKey(candidate + cline.charAt(i + 1));
    }

    /** Small demonstration: segments a sample string with the Simplified segmenter. */
    public static void main(String[] args) throws Exception {
        ChineseSegmenter seg = ChineseSegmenter.getGBSegmenter();
        System.out.println(seg.segmentLine("Some string in chinese.", " "));
    }

}