Updated for OpenNLP 1.7 · dakrone/clojure-opennlp@d9373c9 · GitHub
[go: up one dir, main page]

Skip to content

Commit d9373c9

Browse files
Updated for OpenNLP 1.7
The older, deprecated constructors and training APIs are gone, so the invocations of the various training functions had to be updated.
1 parent 300ff0f commit d9373c9

File tree

1 file changed

+86
-64
lines changed

1 file changed

+86
-64
lines changed

src/opennlp/tools/train.clj

Lines changed: 86 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,43 @@
11
(ns opennlp.tools.train
22
"This namespace contains tools used to train OpenNLP models"
33
(:use [clojure.java.io :only [output-stream reader input-stream]])
4-
(:import (opennlp.tools.util PlainTextByLineStream TrainingParameters)
5-
(opennlp.tools.util.model BaseModel ModelType)
4+
(:import (opennlp.tools.util PlainTextByLineStream
5+
TrainingParameters
6+
MarkableFileInputStreamFactory)
7+
(opennlp.tools.util.model BaseModel
8+
ModelType)
69
(opennlp.tools.dictionary Dictionary)
710
(opennlp.tools.tokenize TokenizerME
811
TokenizerModel
9-
TokenSampleStream)
12+
TokenSampleStream
13+
TokenizerFactory)
1014
(opennlp.tools.sentdetect SentenceDetectorME
1115
SentenceModel
12-
SentenceSampleStream)
16+
SentenceSampleStream
17+
SentenceDetectorFactory)
1318
(opennlp.tools.namefind NameFinderEventStream
1419
NameSampleDataStream
1520
NameFinderME
16-
TokenNameFinderModel)
17-
(opennlp.tools.chunker ChunkerME ChunkSampleStream ChunkerModel)
18-
(opennlp.tools.parser ParseSampleStream ParserModel)
21+
TokenNameFinderModel
22+
TokenNameFinderFactory
23+
BioCodec)
24+
(opennlp.tools.chunker ChunkerME
25+
ChunkSampleStream
26+
ChunkerModel
27+
ChunkerFactory)
28+
(opennlp.tools.parser ParseSampleStream
29+
ParserModel)
1930
(opennlp.tools.parser.lang.en HeadRules)
2031
(opennlp.tools.parser.chunking Parser)
2132
(opennlp.tools.postag POSTaggerME
2233
POSModel
2334
POSDictionary
2435
WordTagSampleStream
25-
POSContextGenerator)
36+
POSTaggerFactory)
2637
(opennlp.tools.doccat DoccatModel
2738
DocumentCategorizerME
28-
DocumentSampleStream)))
39+
DocumentSampleStream
40+
DoccatFactory)))
2941

3042
(defn write-model
3143
"Write a model to disk"
@@ -53,26 +65,31 @@
5365
([in] (train-treebank-chunker "en" in))
5466
([lang in] (train-treebank-chunker lang in 100 5))
5567
([lang in iter cut]
56-
(with-open [rdr (reader in)]
57-
(ChunkerME/train
58-
lang
59-
(ChunkSampleStream.
60-
(PlainTextByLineStream. rdr))
61-
cut iter))))
68+
(ChunkerME/train
69+
lang
70+
(ChunkSampleStream.
71+
(PlainTextByLineStream.
72+
(MarkableFileInputStreamFactory. in) "UTF-8"))
73+
(ChunkerFactory.)
74+
(doto (TrainingParameters.)
75+
(.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
76+
(.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut))))))
6277

6378
(defn ^ParserModel train-treebank-parser
6479
"Returns a treebank parser based a training file and a set of head rules"
6580
([in headrules] (train-treebank-parser "en" in headrules))
6681
([lang in headrules] (train-treebank-parser lang in headrules 100 5))
6782
([lang in headrules iter cut]
68-
(with-open [rdr (reader headrules)
69-
fis (java.io.FileInputStream. in)]
83+
(with-open [rdr (reader headrules)]
7084
(Parser/train
7185
lang
7286
(ParseSampleStream.
7387
(PlainTextByLineStream.
74-
(.getChannel fis) "UTF-8"))
75-
(HeadRules. rdr) iter cut))))
88+
(MarkableFileInputStreamFactory. in) "UTF-8"))
89+
(HeadRules. rdr)
90+
(doto (TrainingParameters.)
91+
(.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
92+
(.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut)))))))
7693

7794

7895
(defn ^TokenNameFinderModel train-name-finder
@@ -87,71 +104,76 @@
87104
([lang in iter cut & {:keys [entity-type feature-gen classifier]
88105
;;MUST be either "MAXENT" or "PERCEPTRON"
89106
:or {entity-type "default" classifier "MAXENT"}}]
90-
(with-open [rdr (reader in)]
91-
(NameFinderME/train
92-
lang
93-
entity-type
94-
(->> rdr
95-
(PlainTextByLineStream.)
96-
(NameSampleDataStream.))
97-
(doto (TrainingParameters.)
98-
(.put TrainingParameters/ALGORITHM_PARAM classifier)
99-
(.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
100-
(.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut)))
101-
feature-gen {}))))
107+
108+
(NameFinderME/train
109+
lang
110+
entity-type
111+
(NameSampleDataStream.
112+
(PlainTextByLineStream.
113+
(MarkableFileInputStreamFactory. in) "UTF-8"))
114+
(doto (TrainingParameters.)
115+
(.put TrainingParameters/ALGORITHM_PARAM classifier)
116+
(.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
117+
(.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut)))
118+
(TokenNameFinderFactory.
119+
feature-gen {} (BioCodec.)))))
102120

103121
(defn ^TokenizerModel train-tokenizer
104122
"Returns a tokenizer based on given training file"
105123
([in] (train-tokenizer "en" in))
106124
([lang in] (train-tokenizer lang in 100 5))
107125
([lang in iter cut]
108-
(with-open [rdr (reader in)]
109-
(TokenizerME/train
110-
lang
111-
(->> rdr
112-
(PlainTextByLineStream.)
113-
(TokenSampleStream.))
114-
false
115-
cut
116-
iter))))
126+
(TokenizerME/train
127+
(TokenSampleStream.
128+
(PlainTextByLineStream.
129+
(MarkableFileInputStreamFactory. in) "UTF-8"))
130+
(TokenizerFactory.
131+
lang nil false nil)
132+
(doto (TrainingParameters.)
133+
(.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
134+
(.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut))))))
117135

118136
(defn ^POSModel train-pos-tagger
119137
"Returns a pos-tagger based on given training file"
120138
([in] (train-pos-tagger "en" in))
121139
([lang in] (train-pos-tagger lang in nil))
122140
([lang in tagdict] (train-pos-tagger lang in tagdict 100 5))
123141
([lang in tagdict iter cut]
124-
(with-open [rdr (reader in)]
125-
(POSTaggerME/train
126-
lang
127-
(WordTagSampleStream. rdr)
128-
(ModelType/MAXENT)
129-
tagdict
130-
nil
131-
cut
132-
iter))))
142+
(POSTaggerME/train
143+
lang
144+
(WordTagSampleStream.
145+
(PlainTextByLineStream.
146+
(MarkableFileInputStreamFactory. in) "UTF-8"))
147+
(POSTaggerFactory. nil tagdict)
148+
(doto (TrainingParameters.)
149+
(.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
150+
(.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut))))))
133151

134152
(defn ^SentenceModel train-sentence-detector
135153
"Returns a sentence model based on a given training file"
136154
([in] (train-sentence-detector "en" in))
137155
([lang in]
138-
(with-open [rdr (reader in)]
139-
(SentenceDetectorME/train lang
140-
(->> rdr
141-
(PlainTextByLineStream.)
142-
(SentenceSampleStream.))
143-
true
144-
nil))))
156+
(SentenceDetectorME/train
157+
lang
158+
(SentenceSampleStream.
159+
(PlainTextByLineStream.
160+
(MarkableFileInputStreamFactory. in) "UTF-8"))
161+
(SentenceDetectorFactory. lang true nil nil)
162+
(TrainingParameters.))))
145163

146164
(defn ^DoccatModel train-document-categorization
147165
"Returns a classification model based on a given training file"
148166
([in] (train-document-categorization "en" in 1 100))
149167
([lang in] (train-document-categorization lang in 1 100))
150-
([lang in cutoff] (train-document-categorization lang in cutoff 100))
151-
([lang in cutoff iterations]
152-
(with-open [rdr (reader in)]
153-
(DocumentCategorizerME/train lang
154-
(->> rdr
155-
(PlainTextByLineStream.)
156-
(DocumentSampleStream.))
157-
cutoff iterations))))
168+
([lang in cut] (train-document-categorization lang in cut 100))
169+
([lang in cut iter]
170+
(DocumentCategorizerME/train
171+
lang
172+
(DocumentSampleStream.
173+
(PlainTextByLineStream.
174+
(MarkableFileInputStreamFactory. in) "UTF-8"))
175+
(doto (TrainingParameters.)
176+
(.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
177+
(.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut)))
178+
(DoccatFactory.))))
179+

0 commit comments

Comments
 (0)
0