文本摘要提取(java实现)

2018-05-07 02:32阅读：

http://blog.sina.cn/dpool/blog/u/6528373442

解决题目：文章提取摘要
本文使用方法：
（1）对文本分词和做词性标注（我使用的是stanfordNLP）。
（2）分别计算和提取词性为NN和VV的出现次数最高的前5个词，作为关键词。NN的 map（词，次数），VV的Map（词，次数）。
（3）根据句号等标点将句子断句。
（4）计算每个句子的权值。权值=该句子中出现的关键词的出现次数之和，即（2）中计算得到的关键词的次数之和；不是关键词的词，权值按0计算。
（5）权值最高的前n个句子，可作为该文本的摘要。
测试结果：

部分代码（不包括分词和词性标注）：

package AutoTextSummary;
import java.awt.List;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import util.QuickSortForClassic;
import util.TextShuffle;
public class TextSummary {
//定义一个Map存储NN关键词+次数最多的5个,次数作为词重要性的权重
private Map NNKey;
//定义一个Map存储VV关键词+次数最多的5个，次数作为词重要性的权重
private Map VVKey;
//断句后的集合，Map（句子，权重），权重=关键词权重之和，只要不是前5的词权值都是0，前5名的词权值就是该词出现的次数
private Map sentence;
//断句标点集合
private HashSet punctuation;
//文章摘要句子集合
private String[] summary;
//取前n个关键句子作为摘要
private int n;

public TextSummary(int n){
NNKey = new HashMap();
VVKey = new HashMap();
//断句标点
punctuation = new HashSet();
punctuation.add('。');
punctuation.add('，');
punctuation.add('；');
punctuation.add('！');
summary = new String[3];
this.n=n;
}

public void getKeyWord(String srcDir){
//定义一个List，存储NN关键词+次数和VV+次数
ArrayList> keyList = new ArrayList>();
keyList.add(new HashMap());
keyList.add(new HashMap());
sentence = new HashMap();
//为了分词和词性标注后结果输出文本，可以不需要
String desDir = srcDir.substring(0, srcDir.lastIndexOf('\\')+1)+'pro' + srcDir.substring(srcDir.lastIndexOf('\\')+1);
ArrayList posSequence;
TextShuffle posText = new TextShuffle();
//分词+词性标注返回的是原文本的每一行做了分词和词性标注，这样的每一行组成的集合List
posSequence = (ArrayList) posText.shuffle(srcDir,desDir,sentence,punctuation);
//统计每个名词和动词作为关键词的出现次数，
String[] word;
String[] temp;
for(int i=0;i
word = posSequence.get(i).split(' ');
for(int j=0;j
temp = word[j].split('#');
//如果是NN
if('NN'.equals(temp[1])){
if(keyList.get(0).containsKey(temp[0])){
keyList.get(0).put(temp[0], keyList.get(0).get(temp[0])+1);
}
else{
keyList.get(0).put(temp[0], 0);
}
}//如果是VV
else if('VV'.equals(temp[1])){
if(keyList.get(1).containsKey(temp[0])){
keyList.get(1).put(temp[0], keyList.get(1).get(temp[0])+1);
}
else{
keyList.get(1).put(temp[0], 0);
}
}
}
}
//关键词排序，NN和VV各取最多的前5个
String[] NN;
String[] VV;
StringBuffer keyBuf = new StringBuffer();
QuickSortForClassic sort;
//先排NN
for(String key : keyList.get(0).keySet()){
keyBuf.append(key);
keyBuf.append(' ');
}
NN = keyBuf.toString().split(' ');
//快速排序
sort = new QuickSortForClassic(keyList.get(0));
sort.quick_sort(NN, 0, NN.length-1);
//清keyBuf
keyBuf.setLength(0);
//再排VV
for(String key : keyList.get(1).keySet()){
keyBuf.append(key);
keyBuf.append(' ');
}
VV = keyBuf.toString().split(' ');
//快速排序
sort = new QuickSortForClassic(keyList.get(1));
sort.quick_sort(VV, 0, VV.length-1);
//给NNKey和VVKey赋值,只取出现次数最多的前5个（已排好序）
for(int i=0;i<5;i++){
NNKey.put(NN, keyList.get(0).get(NN));
}
for(int i=0;i<5;i++){
VVKey.put(VV, keyList.get(1).get(VV));
}
}
//计算每个句子的权值=词的权值相加，不是关键词的权值是0.
private void claSentence(){
//先计算NN
for(String sectenceKey : sentence.keySet()){
for(int i=0;i
for(String wordKey : NNKey.keySet()){
if(contains(sectenceKey,wordKey)){
sentence.put(sectenceKey, sentence.get(sectenceKey)+NNKey.get(wordKey));
}
}
}
}
//再计算VV
for(String sectenceKey : sentence.keySet()){
for(int i=0;i
for(String wordKey : VVKey.keySet()){
if(contains(sectenceKey,wordKey)){
sentence.put(sectenceKey, sentence.get(sectenceKey)+VVKey.get(wordKey));
}
}
}
}
}
//计算权值前n的句子
private void getSummary(){
for(String sentenceKey : sentence.keySet()){
for(int i=0;i
if(summary!=null){
if(sentence.get(sentenceKey)>sentence.get(summary)){
swap(i,sentenceKey);
break;
}
}
else{
summary = sentenceKey;
break;
}
}
}
}
//判断一个句子是否包含关键词
private boolean contains(String sentence,String word){
int wordLength = word.length();
for(int i = 0;i+wordLength
if(word.equals(sentence.substring(i, i+wordLength))){
return true;
}
}
return false;
}
//连个字符串交换数组中的位置
private void swap(int position,String more){
if(position < n-1){
swap(position+1,summary[position]);
}
summary[position] = more;
}
public static void main(String[] args) {
//文件路径
String srcDir = 'C:\\Users\\Administrator\\Desktop\\作业\\文档摘要作业素材\\01.txt';
//构造文本摘要处理对象，n是取前n个关键句子作为文章摘要
TextSummary newTasx = new TextSummary(3);
//获取文本中的关键词：文本统计，分词+标注后，计算出NN和VV做多的前5个词
newTasx.getKeyWord(srcDir);
//计算每个句子的权值
newTasx.claSentence();
//计算权值最大的前n个句子
newTasx.getSummary();

//输出文本摘要
for(int i=0;i
System.out.println(newTasx.summary);
}
}
}

举报/Report

我的更多文章

下载客户端阅读体验更佳

APP专享

新浪博客

文本摘要提取(java实现)

分享

我的更多文章

下载客户端阅读体验更佳

疯狂捕鱼