|
 (ns opennlp.tools.train
   "This namespace contains tools used to train OpenNLP models"
   (:use [clojure.java.io :only [output-stream reader input-stream]])
-  (:import (opennlp.tools.util PlainTextByLineStream TrainingParameters)
-           (opennlp.tools.util.model BaseModel ModelType)
+  (:import (opennlp.tools.util PlainTextByLineStream
+                               TrainingParameters
+                               MarkableFileInputStreamFactory)
+           (opennlp.tools.util.model BaseModel
+                                     ModelType)
            (opennlp.tools.dictionary Dictionary)
            (opennlp.tools.tokenize TokenizerME
                                    TokenizerModel
-                                   TokenSampleStream)
+                                   TokenSampleStream
+                                   TokenizerFactory)
            (opennlp.tools.sentdetect SentenceDetectorME
                                      SentenceModel
-                                     SentenceSampleStream)
+                                     SentenceSampleStream
+                                     SentenceDetectorFactory)
            (opennlp.tools.namefind NameFinderEventStream
                                    NameSampleDataStream
                                    NameFinderME
-                                   TokenNameFinderModel)
-           (opennlp.tools.chunker ChunkerME ChunkSampleStream ChunkerModel)
-           (opennlp.tools.parser ParseSampleStream ParserModel)
+                                   TokenNameFinderModel
+                                   TokenNameFinderFactory
+                                   BioCodec)
+           (opennlp.tools.chunker ChunkerME
+                                  ChunkSampleStream
+                                  ChunkerModel
+                                  ChunkerFactory)
+           (opennlp.tools.parser ParseSampleStream
+                                 ParserModel)
            (opennlp.tools.parser.lang.en HeadRules)
            (opennlp.tools.parser.chunking Parser)
            (opennlp.tools.postag POSTaggerME
                                  POSModel
                                  POSDictionary
                                  WordTagSampleStream
-                                 POSContextGenerator)
+                                 POSTaggerFactory)
            (opennlp.tools.doccat DoccatModel
                                  DocumentCategorizerME
-                                 DocumentSampleStream)))
+                                 DocumentSampleStream
+                                 DoccatFactory)))
|
 (defn write-model
   "Write a model to disk"
...
   ([in] (train-treebank-chunker "en" in))
   ([lang in] (train-treebank-chunker lang in 100 5))
   ([lang in iter cut]
-   (with-open [rdr (reader in)]
-     (ChunkerME/train
-      lang
-      (ChunkSampleStream.
-       (PlainTextByLineStream. rdr))
-      cut iter))))
+   (ChunkerME/train
+    lang
+    (ChunkSampleStream.
+     (PlainTextByLineStream.
+      (MarkableFileInputStreamFactory. in) "UTF-8"))
+    (ChunkerFactory.)
+    (doto (TrainingParameters.)
+      (.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
+      (.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut))))))
|
 (defn ^ParserModel train-treebank-parser
   "Returns a treebank parser based on a training file and a set of head rules"
   ([in headrules] (train-treebank-parser "en" in headrules))
   ([lang in headrules] (train-treebank-parser lang in headrules 100 5))
   ([lang in headrules iter cut]
-   (with-open [rdr (reader headrules)
-               fis (java.io.FileInputStream. in)]
+   (with-open [rdr (reader headrules)]
     (Parser/train
      lang
      (ParseSampleStream.
       (PlainTextByLineStream.
-       (.getChannel fis) "UTF-8"))
-      (HeadRules. rdr) iter cut))))
+       (MarkableFileInputStreamFactory. in) "UTF-8"))
+      (HeadRules. rdr)
+      (doto (TrainingParameters.)
+        (.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
+        (.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut)))))))
|
 (defn ^TokenNameFinderModel train-name-finder
...
   ([lang in iter cut & {:keys [entity-type feature-gen classifier]
                         ;;MUST be either "MAXENT" or "PERCEPTRON"
                         :or {entity-type "default" classifier "MAXENT"}}]
-   (with-open [rdr (reader in)]
-     (NameFinderME/train
-      lang
-      entity-type
-      (->> rdr
-           (PlainTextByLineStream.)
-           (NameSampleDataStream.))
-      (doto (TrainingParameters.)
-        (.put TrainingParameters/ALGORITHM_PARAM classifier)
-        (.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
-        (.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut)))
-      feature-gen {}))))
+
+   (NameFinderME/train
+    lang
+    entity-type
+    (NameSampleDataStream.
+     (PlainTextByLineStream.
+      (MarkableFileInputStreamFactory. in) "UTF-8"))
+    (doto (TrainingParameters.)
+      (.put TrainingParameters/ALGORITHM_PARAM classifier)
+      (.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
+      (.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut)))
+    (TokenNameFinderFactory.
+     feature-gen {} (BioCodec.)))))
|
 (defn ^TokenizerModel train-tokenizer
   "Returns a tokenizer based on given training file"
   ([in] (train-tokenizer "en" in))
   ([lang in] (train-tokenizer lang in 100 5))
   ([lang in iter cut]
-   (with-open [rdr (reader in)]
-     (TokenizerME/train
-      lang
-      (->> rdr
-           (PlainTextByLineStream.)
-           (TokenSampleStream.))
-      false
-      cut
-      iter))))
+   (TokenizerME/train
+    (TokenSampleStream.
+     (PlainTextByLineStream.
+      (MarkableFileInputStreamFactory. in) "UTF-8"))
+    (TokenizerFactory.
+     lang nil false nil)
+    (doto (TrainingParameters.)
+      (.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
+      (.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut))))))
|
 (defn ^POSModel train-pos-tagger
   "Returns a pos-tagger based on given training file"
   ([in] (train-pos-tagger "en" in))
   ([lang in] (train-pos-tagger lang in nil))
   ([lang in tagdict] (train-pos-tagger lang in tagdict 100 5))
   ([lang in tagdict iter cut]
-   (with-open [rdr (reader in)]
-     (POSTaggerME/train
-      lang
-      (WordTagSampleStream. rdr)
-      (ModelType/MAXENT)
-      tagdict
-      nil
-      cut
-      iter))))
+   (POSTaggerME/train
+    lang
+    (WordTagSampleStream.
+     (PlainTextByLineStream.
+      (MarkableFileInputStreamFactory. in) "UTF-8"))
+    (POSTaggerFactory. nil tagdict)
+    (doto (TrainingParameters.)
+      (.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
+      (.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut))))))
|
 (defn ^SentenceModel train-sentence-detector
   "Returns a sentence model based on a given training file"
   ([in] (train-sentence-detector "en" in))
   ([lang in]
-   (with-open [rdr (reader in)]
-     (SentenceDetectorME/train lang
-                               (->> rdr
-                                    (PlainTextByLineStream.)
-                                    (SentenceSampleStream.))
-                               true
-                               nil))))
+   (SentenceDetectorME/train
+    lang
+    (SentenceSampleStream.
+     (PlainTextByLineStream.
+      (MarkableFileInputStreamFactory. in) "UTF-8"))
+    (SentenceDetectorFactory. lang true nil nil)
+    (TrainingParameters.))))
|
 (defn ^DoccatModel train-document-categorization
   "Returns a classification model based on a given training file"
   ([in] (train-document-categorization "en" in 1 100))
   ([lang in] (train-document-categorization lang in 1 100))
-  ([lang in cutoff] (train-document-categorization lang in cutoff 100))
-  ([lang in cutoff iterations]
-   (with-open [rdr (reader in)]
-     (DocumentCategorizerME/train lang
-                                  (->> rdr
-                                       (PlainTextByLineStream.)
-                                       (DocumentSampleStream.))
-                                  cutoff iterations))))
+  ([lang in cut] (train-document-categorization lang in cut 100))
+  ([lang in cut iter]
+   (DocumentCategorizerME/train
+    lang
+    (DocumentSampleStream.
+     (PlainTextByLineStream.
+      (MarkableFileInputStreamFactory. in) "UTF-8"))
+    (doto (TrainingParameters.)
+      (.put TrainingParameters/ITERATIONS_PARAM (Integer/toString iter))
+      (.put TrainingParameters/CUTOFF_PARAM (Integer/toString cut)))
+    (DoccatFactory.))))
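For context, here is a usage sketch of the training functions after this change. It is illustrative only: the file paths are hypothetical, and the training files are wrapped with clojure.java.io/file on the assumption that MarkableFileInputStreamFactory is constructed from a java.io.File rather than a plain path string.

;; Usage sketch (illustrative; hypothetical training-file paths).
(comment
  (require '[clojure.java.io :as io]
           '[opennlp.tools.train :as train])

  ;; Two-argument arities fall back to the defaults shown above
  ;; (100 iterations, cutoff 5 for most trainers).
  (def sent-model (train/train-sentence-detector "en" (io/file "data/en-sent.train")))
  (def tok-model  (train/train-tokenizer "en" (io/file "data/en-token.train") 100 5))

  ;; nil tag dictionary, 100 iterations, cutoff 5.
  (def pos-model  (train/train-pos-tagger "en" (io/file "data/en-pos.train") nil 100 5))

  ;; cutoff 1, 100 iterations (the defaults of the shorter arities).
  (def cat-model  (train/train-document-categorization "en" (io/file "data/doccat.train") 1 100)))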