-
Notifications
You must be signed in to change notification settings - Fork 1
/
ParseSentences.java
95 lines (78 loc) · 3.5 KB
/
ParseSentences.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
package com.fiskkit;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import com.fiskkit.Fetcher;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
/**
 * Splits the paragraphs of a fetched article into sentences with an OpenNLP
 * sentence detector and writes them, numbered, to a text file.
 *
 * <p>The sentence model is loaded once in the constructor and the resulting
 * detector is reused for every article parsed by the same instance.
 */
public class ParseSentences {
    /** Character encoding used for every output file. */
    public static final String ENCODING = "utf-8";

    /** Classpath name of the pre-trained sentence model. */
    private static final String MODEL_RESOURCE = "en-sent.bin";

    /** Detector built once in the constructor; passed to writeToFile by doParse. */
    SentenceDetectorME sentenceDetector;

    /**
     * Loads the pre-trained sentence model from the classpath and builds the
     * sentence detector from it.
     *
     * <p>Loading happens here, once, so the model is not re-read every time a
     * paragraph is parsed.
     *
     * @throws IllegalStateException if the model resource is not on the
     *         classpath or cannot be read
     */
    public ParseSentences() {
        try (InputStream modelIn = ParseSentences.class.getClassLoader().getResourceAsStream(MODEL_RESOURCE)) {
            // getResourceAsStream signals "not found" by returning null, not by throwing.
            if (modelIn == null) {
                throw new IllegalStateException(
                        "Sentence model '" + MODEL_RESOURCE + "' not found on the classpath");
            }
            SentenceModel model = new SentenceModel(modelIn);
            sentenceDetector = new SentenceDetectorME(model);
        } catch (IOException e) {
            // Fail fast: continuing would leave sentenceDetector null and only
            // surface later as a NullPointerException in writeToFile.
            throw new IllegalStateException(
                    "Could not load sentence model '" + MODEL_RESOURCE + "'", e);
        }
    }

    /**
     * Fetches the article at the given url and writes its sentences to a file.
     * The paragraphs come from Fetcher.pullAndExtract(url) and are handed to
     * writeToFile together with this instance's sentence detector.
     *
     * @param url the website url we will fetch our article from
     * @param filename the name of the file the sentences are written to
     */
    public void doParse(String url, String filename) {
        System.out.println("Parsing " + url + ", writing to " + filename);
        List<String> paragraphs = Fetcher.pullAndExtract(url);
        writeToFile(paragraphs, sentenceDetector, filename);
    }

    /**
     * Parses the paragraphs into sentences and writes them to a file, one
     * sentence per line in the form {@code [n] --> "sentence"}.
     * This method overwrites whichever file it is writing to.
     *
     * <p>An IOException while opening or writing the file is reported on
     * standard error and not rethrown, so the caller continues.
     *
     * @param paragraphs the List of paragraph strings, e.g. as obtained with
     *        Fetcher.pullAndExtract in doParse
     * @param sentenceDetector the detector we use to parse the paragraphs
     * @param filename where the strings will be written to
     */
    public void writeToFile(List<String> paragraphs, SentenceDetectorME sentenceDetector, String filename) {
        // Index that keeps count throughout all of the paragraphs, not just one
        int lineIndex = 0;
        // Use BufferedWriter instead of Writer for newLine()
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(filename), ENCODING))) {
            for (String paragraph : paragraphs) {
                for (String sentence : sentenceDetector.sentDetect(paragraph)) {
                    lineIndex++;
                    writer.write(String.format("[%d] --> \"%s\"", lineIndex, sentence));
                    writer.newLine();
                }
            }
        } catch (IOException e) {
            // Best effort: report which file failed and let the caller carry on.
            System.err.println("Failed to write sentences to " + filename);
            e.printStackTrace();
        }
    }

    /**
     * Parses three sample articles and writes each to its own text file.
     *
     * @param argv command-line arguments (unused)
     */
    public static void main(String[] argv) {
        ParseSentences parser = new ParseSentences();
        // Sample articles we can parse
        parser.doParse("http://www.weeklystandard.com/blogs/intel-chief-blasts-obama_802242.html", "intel.txt");
        parser.doParse("http://www.thedailybeast.com/articles/2014/08/21/swat-lobby-takes-a-shot-at-congress.html", "swat.txt");
        parser.doParse("http://www.thedailybeast.com/articles/2014/08/12/russia-s-suspicious-humanitarian-aid-for-ukraine-separatists.html", "russia.txt");
    }
}