Java 版中文地址识别与切分源码

原文地址:https://www.vbox.top/38.html

我几乎没接触过 C#,不过它跟 Java 很相似,于是花了一天时间,将原作者的代码改写成了 Java 版。
Splitter.java 文件如下:

import java.util.regex.Pattern;

/**
 * Created by ajtdnyy on 13-9-3.
 *
 * <p>Bundles a boundary-finding pattern with the candidate patterns used to
 * classify the text that ends at a boundary found by it.
 */
public class Splitter {

    /** Pattern searched for in the address to locate segment boundaries. */
    Pattern pattern;
    /** Candidate patterns tried, in order, against the text ending at a boundary. */
    Pattern[] patterns;
    /** When {@code true}, only the first match of {@link #pattern} is used as a boundary. */
    boolean flag = true;

    /**
     * Creates a splitter without candidate patterns; {@code flag} stays {@code true}.
     *
     * @param pattern boundary-finding pattern
     */
    public Splitter(Pattern pattern) {
        this(pattern, null);
    }

    /**
     * Creates a splitter whose {@code flag} is {@code true}.
     *
     * @param pattern  boundary-finding pattern
     * @param patterns candidate classification patterns
     */
    public Splitter(Pattern pattern, Pattern[] patterns) {
        this(pattern, patterns, true);
    }

    /**
     * Creates a fully specified splitter.
     *
     * @param pattern  boundary-finding pattern
     * @param patterns candidate classification patterns
     * @param flag     whether only the first match of {@code pattern} is used
     */
    public Splitter(Pattern pattern, Pattern[] patterns, boolean flag) {
        this.patterns = patterns;
        this.flag = flag;
        this.pattern = pattern;
    }
}

Segment.java类如下

import java.util.regex.Pattern;

/**
 * Created by ajtdnyy on 13-9-3.
 *
 * <p>A recognized piece of text paired with the pattern that matched it.
 */
public class Segment {

    /** Pattern that recognized {@link #value}. */
    Pattern pattern;
    /** The matched text. */
    String value;

    /**
     * @param value   the matched text
     * @param pattern the pattern that matched {@code value}
     */
    public Segment(String value, Pattern pattern) {
        this.pattern = pattern;
        this.value = value;
    }
}

ChineseAddress.java类如下

import java.util.List;

/**
 * Created by ajtdnyy on 13-9-3.
 */
public class ChineseAddress {

    public String source;
    public String nation;
    public String province;
    public String city;
    public String county;
    public String district;
    public String street;
    public List roads;
    public String number;
    public String plaza;
    public String ip;
    public String town;
    public String village;
    public String zone;
    public String underground;
    public List notes;
    public List noises;
    private static final String SEPARATOR = System.getProperty("line.separator");

    public String toString() {
        String s = "src: " + source + SEPARATOR;
        if (nation != null) {
            s = s + "nat: " + nation + SEPARATOR;
        }
        if (province != null) {
            s = s + "pro: " + province + SEPARATOR;
        }
        if (city != null) {
            s = s + "cit: " + city + SEPARATOR;
        }
        if (county != null) {
            s = s + "cou: " + county + SEPARATOR;
        }
        if (district != null) {
            s = s + "dis: " + district + SEPARATOR;
        }
        if (street != null) {
            s = s + "str: " + street + SEPARATOR;
        }
        if (number != null) {
            s = s + "num: " + number + SEPARATOR;
        }
        if (plaza != null) {
            s = s + "pla: " + plaza + SEPARATOR;
        }
        if (ip != null) {
            s = s + "idp: " + ip + SEPARATOR;
        }
        if (town != null) {
            s = s + "twn: " + town + SEPARATOR;
        }
        if (village != null) {
            s = s + "vil: " + village + SEPARATOR;
        }
        if (zone != null) {
            s = s + "zon: " + zone + SEPARATOR;
        }
        if (underground != null) {
            s = s + "udg: " + underground + SEPARATOR;
        }
        if (roads != null) {
            s = s + "rod: ";
            for (int i = 0; i < roads.size(); i++) {
                String r = roads.get(i);
                if (r == roads.get(0)) {
                    s = s + r;
                } else {
                    s = s + " / " + r;
                }
            }
            s = s + SEPARATOR;
        }
        if (notes != null) {
            s = s + "not: ";
            for (int i = 0; i < notes.size(); i++) {
                String n = notes.get(i);
                if (n == roads.get(0)) {
                    s = s + n;
                } else {
                    s = s + " / " + n;
                }
            }
            s = s + SEPARATOR;
        }
        if (noises != null) {
            s = s + "noi: ";
            for (int i = 0; i < noises.size(); i++) {
                s = s + noises.get(i) + " / ";
            }
            s = s + SEPARATOR;
        }
        return s;
    }
}

ChineseAddressParser.java类如下

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by ajtdnyy on 13-9-3.
 *
 * <p>Regex-based splitter for Chinese address strings. Parsing runs in three
 * steps: {@link #split} locates candidate segment boundaries, {@link #recognize}
 * classifies the text between consecutive boundaries, and {@link #parse} copies
 * the classified segments into a {@link ChineseAddress}.
 */
public class ChineseAddressParser {

    /** Character class for a single character in the range U+4E00..U+9FA5. */
    static final String reg = "[\u4e00-\u9fa5]";

    // Boundary patterns: searched in the whole address to find where a segment ends.
    static final Pattern ms_Pattern_guo = Pattern.compile("中国");
    static final Pattern ms_Pattern_jinjiao = Pattern.compile("近郊");
    static final Pattern ms_Pattern_sheng = Pattern.compile(reg + "+?省");
    static final Pattern ms_Pattern_shi = Pattern.compile(reg + "+?市(?!场)");
    static final Pattern ms_Pattern_qu = Pattern.compile(reg + "+?区");
    static final Pattern ms_Pattern_xiang = Pattern.compile(reg + "+?乡");
    static final Pattern ms_Pattern_xian = Pattern.compile(reg + "+?县");
    static final Pattern ms_Pattern_dao = Pattern.compile(reg + "+?道");
    static final Pattern ms_Pattern_hutong = Pattern.compile(reg + "+?胡同");
    static final Pattern ms_Pattern_nongtang = Pattern.compile(reg + "+?弄堂");
    static final Pattern ms_Pattern_jie = Pattern.compile(reg + "+?街");
    static final Pattern ms_Pattern_xiangg = Pattern.compile(reg + "+?巷");
    static final Pattern ms_Pattern_lu = Pattern.compile(reg + "+?路");
    static final Pattern ms_Pattern_cun = Pattern.compile(reg + "+?村");
    static final Pattern ms_Pattern_zhen = Pattern.compile(reg + "+?镇");
    static final Pattern ms_Pattern_hao = Pattern.compile("[甲_乙_丙_0-9_-]+?号");
    static final Pattern ms_Pattern_point = Pattern.compile(reg + "+?(?:广场|酒店|饭店|宾馆|中心|大厦|百货|大楼|商城)");
    static final Pattern ms_Pattern_ditie = Pattern.compile("地铁" + reg + "+?线(?:" + reg + "+?站)?");

    // Recognition patterns: must match an entire candidate segment. Segments are
    // later looked up by pattern identity, so each constant is its own category.
    static final Pattern ms_Pattern_province = Pattern.compile(reg + "{2,10}?(?:省|特区|自治区|特别行政区)");
    static final Pattern ms_Pattern_city = Pattern.compile(reg + "+?(?:市|地区|自治州)");
    static final Pattern ms_Pattern_county = Pattern.compile(reg + "+?(?:乡|县)");
    static final Pattern ms_Pattern_street = Pattern.compile(reg + "+?街道");
    static final Pattern ms_Pattern_road = Pattern.compile(reg + "+?(?:胡同|弄堂|街|巷|路|道)");
    static final Pattern ms_Pattern_roadnear = Pattern.compile("(?<=近)" + reg + "+?(?:胡同|弄堂|街|巷|路|道)");
    static final Pattern ms_Pattern_ip = Pattern.compile(reg + "+?(?:开发区|科技区|园区)");
    static final Pattern ms_Pattern_zone = Pattern.compile(reg + "+?(?:小区|社区|新村)");
    static final Pattern ms_Pattern_village = Pattern.compile(reg + "+?村");
    static final Pattern ms_Pattern_town = Pattern.compile(reg + "+?镇");
    static final Pattern ms_Pattern_number = Pattern.compile("[甲_乙_丙_0-9_-]+号");
    static final Pattern ms_Pattern_plaza = Pattern.compile(reg + "+?(?:广场|酒店|饭店|宾馆|中心|大厦|百货|大楼|商城)");
    static final Pattern ms_Pattern_underground = Pattern.compile("地铁" + reg + "+?线(?:" + reg + "+?站)?");

    // Splitters: a boundary pattern plus the recognition patterns to try, in order,
    // on the text that ends at a boundary. A trailing "false" means every match of
    // the boundary pattern is used instead of only the first one.
    static final Splitter ms_splitter_guo = new Splitter(ms_Pattern_guo, new Pattern[]{ms_Pattern_guo});
    static final Splitter ms_splitter_sheng = new Splitter(ms_Pattern_sheng, new Pattern[]{ms_Pattern_province});
    static final Splitter ms_splitter_shi = new Splitter(ms_Pattern_shi, new Pattern[]{ms_Pattern_city}, false);
    static final Splitter ms_splitter_jinjiao = new Splitter(ms_Pattern_jinjiao, new Pattern[]{ms_Pattern_jinjiao});
    static final Splitter ms_splitter_qu = new Splitter(ms_Pattern_qu, new Pattern[]{ms_Pattern_province, ms_Pattern_city, ms_Pattern_zone, ms_Pattern_ip, ms_Pattern_qu}, false);
    static final Splitter ms_splitter_xiang = new Splitter(ms_Pattern_xiang, new Pattern[]{ms_Pattern_county});
    static final Splitter ms_splitter_xian = new Splitter(ms_Pattern_xian, new Pattern[]{ms_Pattern_county});
    static final Splitter ms_splitter_dao = new Splitter(ms_Pattern_dao, new Pattern[]{ms_Pattern_street, ms_Pattern_roadnear, ms_Pattern_road}, false);
    static final Splitter ms_splitter_hutong = new Splitter(ms_Pattern_hutong, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
    static final Splitter ms_splitter_nongtang = new Splitter(ms_Pattern_nongtang, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
    static final Splitter ms_splitter_jie = new Splitter(ms_Pattern_jie, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
    static final Splitter ms_splitter_lu = new Splitter(ms_Pattern_lu, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
    static final Splitter ms_splitter_xiangg = new Splitter(ms_Pattern_xiangg, new Pattern[]{ms_Pattern_roadnear, ms_Pattern_road}, false);
    static final Splitter ms_splitter_cun = new Splitter(ms_Pattern_cun, new Pattern[]{ms_Pattern_zone, ms_Pattern_village});
    static final Splitter ms_splitter_zhen = new Splitter(ms_Pattern_zhen, new Pattern[]{ms_Pattern_town});
    static final Splitter ms_splitter_hao = new Splitter(ms_Pattern_hao, new Pattern[]{ms_Pattern_number});
    static final Splitter ms_splitter_point = new Splitter(ms_Pattern_point, new Pattern[]{ms_Pattern_plaza});
    static final Splitter ms_splitter_ditie = new Splitter(ms_Pattern_ditie, new Pattern[]{ms_Pattern_underground});
    static final Splitter[] ms_defaultsplitters = new Splitter[]{
            ms_splitter_guo,
            ms_splitter_sheng,
            ms_splitter_shi,
            ms_splitter_qu,
            ms_splitter_xiang,
            ms_splitter_xian,
            ms_splitter_dao,
            ms_splitter_hutong,
            ms_splitter_nongtang,
            ms_splitter_jie,
            ms_splitter_xiangg,
            ms_splitter_lu,
            ms_splitter_cun,
            ms_splitter_zhen,
            ms_splitter_hao,
            ms_splitter_point,
            ms_splitter_ditie,
            ms_splitter_jinjiao
    };

    /**
     * Finds candidate segment boundaries in {@code src}.
     *
     * @param src       the address text
     * @param splitters splitters to apply, in order
     * @return map from the end offset of a boundary match to the splitter that
     *         produced it. Entries iterate in insertion order (splitter order,
     *         then match order), not in offset order; when two splitters end at
     *         the same offset the later one replaces the earlier value.
     */
    private static LinkedHashMap<Integer, Splitter> split(String src, Splitter[] splitters) {
        LinkedHashMap<Integer, Splitter> splitterdic = new LinkedHashMap<Integer, Splitter>();
        for (Splitter s : splitters) {
            Matcher m = s.pattern.matcher(src);
            while (m.find()) {
                splitterdic.put(m.end(), s);
                if (s.flag) {
                    // This splitter only contributes its first match.
                    break;
                }
            }
        }
        return splitterdic;
    }

    /**
     * Classifies the text between consecutive boundaries.
     *
     * <p>Boundaries are visited in the map's iteration order. A boundary at or
     * before the current position is skipped. Otherwise the text from the current
     * position up to the boundary is tried against the splitter's candidate
     * patterns, the first full match is recorded, and the position advances to
     * the boundary whether or not a candidate matched.
     *
     * @param src         the address text
     * @param splitterdic boundaries as produced by {@link #split}
     * @return the recognized segments, in the order they were found
     */
    private static ArrayList<Segment> recognize(String src, LinkedHashMap<Integer, Splitter> splitterdic) {
        ArrayList<Segment> segments = new ArrayList<Segment>();
        if (src.length() == 0) {
            return segments;
        }
        int index = 0;
        for (Integer key : splitterdic.keySet()) {
            Splitter value = splitterdic.get(key);
            // "<=" (not "<"): a boundary at the very end of the text is valid, so
            // an address that ends with a recognizable part keeps that part.
            if (key > index && key <= src.length()) {
                String candidate = src.substring(index, key);
                for (Pattern r : value.patterns) {
                    Segment s = segmentRecognize(candidate, r);
                    if (s != null) {
                        segments.add(s);
                        break;
                    }
                }
                index = key;
            }
        }
        return segments;
    }

    /**
     * Wraps {@code src} in a {@link Segment} when {@code r} matches it entirely.
     *
     * @return the segment, or {@code null} when {@code r} does not match all of {@code src}
     */
    private static Segment segmentRecognize(String src, Pattern r) {
        Matcher m = r.matcher(src);
        return m.matches() ? new Segment(m.group(), r) : null;
    }

    /**
     * Collects the text of every segment recognized by exactly the pattern
     * instance {@code r} (identity comparison).
     */
    private static ArrayList<String> segmentsGetStringListForPattern(ArrayList<Segment> segments, Pattern r) {
        ArrayList<String> ss = new ArrayList<String>();
        for (Segment s : segments) {
            if (s.pattern == r) {
                ss.add(s.value);
            }
        }
        return ss;
    }

    /**
     * Returns the text of the first segment recognized by exactly the pattern
     * instance {@code r}, or {@code null} when there is none.
     */
    private static String segmentsGetStringForPattern(ArrayList<Segment> segments, Pattern r) {
        for (Segment s : segments) {
            if (s.pattern == r) {
                return s.value;
            }
        }
        return null;
    }

    /** Prints the parse result of three sample addresses. */
    public static void main(String[] args) {
        System.out.println(ChineseAddressParser.parse("北京市海淀区中关村北大街37号天龙大厦3层"));
        System.out.println(ChineseAddressParser.parse("福州市台江区群众路278号源利明珠大厦6楼"));
        System.out.println(ChineseAddressParser.parse("北京西城区百万庄大街68号6楼"));
    }

    /**
     * Parses an address string into its parts.
     *
     * @param source the raw address; must not be {@code null}
     * @return the parsed address; parts that were not recognized stay {@code null},
     *         except {@code roads}, which is always a (possibly empty) list
     */
    public static ChineseAddress parse(String source) {
        // Strip punctuation before splitting. The third call previously repeated
        // the ASCII comma and was a no-op; it presumably was the full-width comma
        // (U+FF0C), written here as an escape so it survives re-encoding.
        source = source.replace(".", "").replace(",", "").replace("\uff0c", "");
        ArrayList<Segment> segments = recognize(source, split(source, ms_defaultsplitters));
        ChineseAddress ca = new ChineseAddress();
        ca.source = source;

        ca.nation = segmentsGetStringForPattern(segments, ms_Pattern_guo);
        ca.province = segmentsGetStringForPattern(segments, ms_Pattern_province);
        ca.city = segmentsGetStringForPattern(segments, ms_Pattern_city);
        ca.district = segmentsGetStringForPattern(segments, ms_Pattern_qu);
        ca.county = segmentsGetStringForPattern(segments, ms_Pattern_county);
        ca.street = segmentsGetStringForPattern(segments, ms_Pattern_street);

        // Plain roads first, then roads introduced by the "near" marker.
        ArrayList<String> roads = segmentsGetStringListForPattern(segments, ms_Pattern_road);
        roads.addAll(segmentsGetStringListForPattern(segments, ms_Pattern_roadnear));
        ca.roads = roads;

        ca.underground = segmentsGetStringForPattern(segments, ms_Pattern_underground);
        ca.number = segmentsGetStringForPattern(segments, ms_Pattern_number);
        ca.plaza = segmentsGetStringForPattern(segments, ms_Pattern_plaza);
        ca.ip = segmentsGetStringForPattern(segments, ms_Pattern_ip);
        ca.town = segmentsGetStringForPattern(segments, ms_Pattern_town);
        ca.village = segmentsGetStringForPattern(segments, ms_Pattern_village);
        // Zone segments are recognized by the qu/cun splitters but were never
        // copied into the result.
        ca.zone = segmentsGetStringForPattern(segments, ms_Pattern_zone);
        return ca;
    }
}

原作者C#博客地址:http://blog.csdn.net/helanmouse/article/details/4096933

You may also like...

发表评论

您的电子邮箱地址不会被公开。 必填项已用*标注

此站点使用Akismet来减少垃圾评论。了解我们如何处理您的评论数据