// NaturalLanguageProcessing.java (committed by zhourui)
package com.x.base.core.project.nlp;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import com.x.base.core.project.gson.GsonPropertyObject;

/**
 * Segments text with HanLP and returns the distinct words that survive a set
 * of filters, each annotated with how many times it occurred.
 */
public class NaturalLanguageProcessing {

	/**
	 * A word whose value starts with any of these strings is skipped.
	 * Left non-final so that existing code assigning this field keeps compiling.
	 */
	public static String[] SKIP_START_WITH = new String[] { "~", "!", "#", "$", "%", "^", "&", "*", "(", ")", "<", ">",
			"[", "]", "{", "}", "\\", "?" };

	/**
	 * A word whose value ends with any of these strings is skipped.
	 * Left non-final so that existing code assigning this field keeps compiling.
	 */
	public static String[] SKIP_END_WITH = new String[] { "~", "!", "#", "$", "%", "^", "&", "*", "(", ")", "<", ">",
			"[", "]", "{", "}", "\\", "?" };

	/*
	 * Label (part-of-speech tag) prefixes that are skipped, matched
	 * case-insensitively: b distinguishing word, c conjunction, d adverb,
	 * e interjection, f locality word, h prefix, k suffix, o onomatopoeia,
	 * p preposition, q measure word, r pronoun, u auxiliary particle,
	 * w punctuation.
	 */
	private static final String[] SKIP_LABEL_PREFIXES = { "b", "c", "d", "e", "f", "h", "k", "o", "p", "q", "r", "u",
			"w" };

	/**
	 * Segments {@code content} and counts the words that are not skipped.
	 *
	 * @param content the text to segment; blank or {@code null} yields an empty list
	 * @return a new mutable list with one {@link Item} per distinct word value,
	 *         ordered by descending count; words with the same count keep the
	 *         order in which they first appeared. Each item carries the label
	 *         of its first occurrence.
	 */
	public List<Item> word(String content) {
		List<Item> items = new ArrayList<>();
		if (StringUtils.isNotBlank(content)) {
			for (Term t : HanLP.segment(content)) {
				Item item = new Item();
				/* Defensive: avoid a NullPointerException should a term carry no nature. */
				item.setLabel(t.nature == null ? null : t.nature.toString());
				/*
				 * Turn full-width (Chinese) spaces into ordinary spaces, then trim.
				 * The escape is spelled out so the full-width character cannot be
				 * silently normalised to an ASCII space by an editor or encoding.
				 */
				item.setValue(StringUtils.trimToEmpty(StringUtils.replace(t.word, "\u3000", " ")));
				if (!skip(item)) {
					items.add(item);
				}
			}
		}
		/* LinkedHashMap keeps first-occurrence order so that ties sort deterministically. */
		Map<Item, Long> counts = items.stream()
				.collect(Collectors.groupingBy(Function.identity(), LinkedHashMap::new, Collectors.counting()));
		List<Map.Entry<Item, Long>> entries = new ArrayList<>(counts.entrySet());
		/* List.sort is stable, so equal counts stay in insertion order. */
		entries.sort(Map.Entry.<Item, Long>comparingByValue().reversed());
		List<Item> list = new ArrayList<>(entries.size());
		for (Map.Entry<Item, Long> entry : entries) {
			Item item = entry.getKey();
			/* count is not part of equals/hashCode, so mutating the key here is safe. */
			item.setCount(entry.getValue());
			list.add(item);
		}
		return list;
	}

	/**
	 * Decides whether a segmented word is dropped from the result.
	 *
	 * @param o the candidate word
	 * @return {@code true} when the value is at most one character long, starts
	 *         with an entry of {@link #SKIP_START_WITH}, ends with an entry of
	 *         {@link #SKIP_END_WITH}, has a label starting with one of the
	 *         skipped prefixes, or is an "m"-labelled parsable number
	 */
	private boolean skip(Item o) {
		String value = o.getValue();
		if (StringUtils.length(value) <= 1) {
			return true;
		}
		if (StringUtils.startsWithAny(value, SKIP_START_WITH) || StringUtils.endsWithAny(value, SKIP_END_WITH)) {
			return true;
		}
		String label = o.getLabel();
		for (String prefix : SKIP_LABEL_PREFIXES) {
			if (StringUtils.startsWithIgnoreCase(label, prefix)) {
				return true;
			}
		}
		return isParsableNumeral(o);
	}

	/**
	 * Tells whether the item's label starts with "m" (case-insensitive) and its
	 * value is parsable as a number according to {@code NumberUtils.isParsable}.
	 *
	 * @param item the candidate word
	 * @return {@code true} only when both conditions hold
	 */
	private boolean isParsableNumeral(Item item) {
		return StringUtils.startsWithIgnoreCase(item.getLabel(), "m") && NumberUtils.isParsable(item.getValue());
	}

	/**
	 * A segmented word: its text, its label and its occurrence count.
	 * Two items are equal when their values are equal; label and count are
	 * ignored by {@link #equals(Object)} and {@link #hashCode()}.
	 */
	public static class Item extends GsonPropertyObject {

		/* The word text. */
		private String value;

		/* The label assigned by the segmenter (string form of the term's nature). */
		private String label;

		/* Number of occurrences; set by NaturalLanguageProcessing.word. */
		private Long count;

		public String getValue() {
			return value;
		}

		public void setValue(String value) {
			this.value = value;
		}

		public String getLabel() {
			return label;
		}

		public void setLabel(String label) {
			this.label = label;
		}

		public Long getCount() {
			return count;
		}

		public void setCount(Long count) {
			this.count = count;
		}

		@Override
		public int hashCode() {
			/* Same result as 31 * 1 + (value == null ? 0 : value.hashCode()). */
			return Objects.hash(value);
		}

		@Override
		public boolean equals(Object obj) {
			if (this == obj) {
				return true;
			}
			if (obj == null || getClass() != obj.getClass()) {
				return false;
			}
			return Objects.equals(value, ((Item) obj).value);
		}
	}
}