|
1
|
|
|
# -*- coding: utf-8 -*- |
|
2
|
|
|
"""A Python wrapper for Stanford CoreNLP's SUTime.""" |
|
3
|
|
|
|
|
4
|
|
|
import glob |
|
5
|
|
|
import importlib |
|
6
|
|
|
import json |
|
7
|
|
|
import logging |
|
8
|
|
|
import os |
|
9
|
|
|
import socket |
|
10
|
|
|
import sys |
|
11
|
|
|
import threading |
|
12
|
|
|
from pathlib import Path |
|
13
|
|
|
from typing import Dict, List, Optional |
|
14
|
|
|
|
|
15
|
|
|
import jpype # pyre-ignore[21] |
|
16
|
|
|
|
|
17
|
|
|
# Default timeout, in seconds, installed below for all sockets created after
# this module is imported.
SOCKET_DEFAULT_TIMEOUT = 15
# Backward-compatible alias: the constant was originally spelled "SOCKED";
# keep the old name so existing references keep working.
SOCKED_DEFAULT_TIMEOUT = SOCKET_DEFAULT_TIMEOUT
socket.setdefaulttimeout(SOCKET_DEFAULT_TIMEOUT)
|
19
|
|
|
|
|
20
|
|
|
|
|
21
|
|
|
class SUTime(object):
    """Python wrapper for SUTime (CoreNLP) by Stanford.

    The wrapper starts a JVM through JPype (unless told one is already
    running), instantiates the Java class named by `_sutime_java_class` and
    exposes its `annotate()` method through `parse()`.
    """

    _sutime_python_jar = 'stanford-corenlp-sutime-python-1.4.0.jar'
    _sutime_java_class = 'edu.stanford.nlp.python.SUTimeWrapper'
    _corenlp_version = '4.0.0'

    # full name or ISO 639-1 code -> normalized full language name
    _languages = {
        'arabic': 'arabic',
        'ar': 'arabic',
        'chinese': 'chinese',
        'zh': 'chinese',
        'english': 'english',
        'british': 'british',
        'en': 'english',
        'french': 'french',
        'fr': 'french',
        'german': 'german',
        'de': 'german',
        'spanish': 'spanish',
        'es': 'spanish',
    }

    # https://github.com/stanfordnlp/CoreNLP/tree/master/src/edu/stanford/nlp/time/rules
    _supported_languages = {'british', 'english', 'spanish'}

    # File names that must be present somewhere below the jars directory.
    # The CoreNLP jar names are derived from `_corenlp_version` so the version
    # only has to be bumped in one place.
    _required_jars = {
        'stanford-corenlp-{0}-models.jar'.format(_corenlp_version),
        'stanford-corenlp-{0}.jar'.format(_corenlp_version),
        'gson-2.8.6.jar',
        'slf4j-simple-1.7.30.jar',
    }

    def __init__(
        self,
        jars: Optional[str] = None,
        jvm_started: Optional[bool] = False,
        mark_time_ranges: Optional[bool] = False,
        include_range: Optional[bool] = False,
        jvm_flags: Optional[List[str]] = None,
        language: Optional[str] = 'english',
    ):
        """Initialize `SUTime` wrapper.

        Args:
            jars (Optional[str]): Path to previously downloaded SUTime Java
                dependencies. Defaults to None, in which case the `jars`
                directory next to this module is used.
            jvm_started (Optional[bool]): Flag to indicate that JVM has been
                already started (with all Java dependencies loaded). Defaults
                to False.
            mark_time_ranges (Optional[bool]): SUTime flag for
                sutime.markTimeRanges. Defaults to False.
                "Whether or not to recognize time ranges such as 'July to
                August'"
            include_range (Optional[bool]): SUTime flag for
                sutime.includeRange. Defaults to False.
                "Whether or not to add range info to the TIMEX3 object"
            jvm_flags (Optional[List[str]]): List of flags passed to JVM. For
                example, this may be used to specify the maximum heap size
                using '-Xmx'. Has no effect if `jvm_started` is set to True.
                Defaults to None.
            language (Optional[str]): Selected language. Currently supported
                are: english (/en), british, spanish (/es). Defaults to
                `english`.

        Raises:
            RuntimeError: If `language` is unknown, a required language model
                jar is missing, or (when the JVM is started here) not all
                required jars are found.
            SystemExit: If the Java wrapper class cannot be loaded or
                instantiated.
        """
        self.mark_time_ranges = mark_time_ranges
        self.include_range = include_range
        self._is_loaded = False
        self._lock = threading.Lock()
        module_root = Path(__file__).resolve().parent
        self.jars = Path(jars) if jars else module_root / 'jars'

        # A falsy language (None / '') is checked as '' and therefore rejected
        # as unsupported.
        self._check_language_model_dependency(
            language.lower() if language else '',
        )

        if not jvm_started:
            self._classpath = self._create_classpath()
            self._start_jvm(jvm_flags)

        try:
            # make it thread-safe
            if threading.active_count() > 1:
                if not jpype.isThreadAttachedToJVM():
                    jpype.attachThreadToJVM()
            # `with` guarantees the lock is released only if it was acquired;
            # a failure above must not trigger a release of an unheld lock.
            with self._lock:
                wrapper = jpype.JClass(self._sutime_java_class)
                # NOTE(review): the language is forwarded exactly as given by
                # the caller (not lower-cased / normalized) - confirm that the
                # Java wrapper accepts ISO codes and mixed case.
                self._sutime = wrapper(
                    self.mark_time_ranges, self.include_range, language,
                )
                self._is_loaded = True
        except Exception as exc:
            # Kept for backward compatibility: load failures terminate via
            # SystemExit rather than propagating the original exception.
            sys.exit('Could not load JVM: {0}'.format(exc))

    def parse(
        self, input_str: str, reference_date: Optional[str] = '',
    ) -> List[Dict]:
        """Parse datetime information out of string input.

        It invokes the SUTimeWrapper.annotate() function in Java.

        Args:
            input_str (str): The input as string that has to be parsed.
            reference_date (Optional[str]): Optional reference date for SUTime.
                Only forwarded to Java when it is non-empty. Defaults to `''`.

        Returns:
            A list of dicts with the result from the SUTimeWrapper.annotate()
            call (its string form decoded as JSON).

        Raises:
            RuntimeError: An error occurs when CoreNLP is not loaded.
        """
        if not self._is_loaded:
            raise RuntimeError('Please load SUTime first!')

        if reference_date:
            annotations = self._sutime.annotate(input_str, reference_date)
        else:
            annotations = self._sutime.annotate(input_str)
        return json.loads(str(annotations))

    def _check_language_model_dependency(self, language: str) -> None:
        """Validate `language` and make sure its model jar is available.

        Args:
            language (str): Lower-cased full language name or ISO 639-1 code.

        Raises:
            RuntimeError: If the language is not a key of `_languages`, or if
                it is a supported non-English language whose models jar does
                not exist in `self.jars`.
        """
        if language not in self._languages:
            raise RuntimeError('Unsupported language: {0}'.format(language))
        normalized_language = self._languages[language]

        if normalized_language not in self._supported_languages:
            # Known language without SUTime rules: warn only, do not fail.
            logging.warning(
                '%s: %s. %s.',
                normalized_language.capitalize(),
                'is not (yet) supported by SUTime',
                'Falling back to default model',
            )
            return

        # English variants are accepted without a language-specific jar.
        if normalized_language in {'english', 'british'}:
            return

        language_model_file = (
            self.jars / 'stanford-corenlp-{0}-models-{1}.jar'.format(
                self._corenlp_version,
                normalized_language,
            ))

        # Plain existence check: the path is a literal file name, so it must
        # not be interpreted as a glob pattern (the jars directory may contain
        # characters such as '[' or '*').
        if not language_model_file.exists():
            raise RuntimeError(
                'Missing language model for {0}! Run {1} {2} {3}'.format(
                    normalized_language.capitalize(),
                    'mvn dependency:copy-dependencies',
                    '-DoutputDirectory=./sutime/jars -P',
                    normalized_language,
                ),
            )

    def _start_jvm(self, additional_flags: Optional[List[str]]) -> None:
        """Start the JVM with the prepared classpath unless it already runs.

        Args:
            additional_flags (Optional[List[str]]): Extra JVM flags appended
                after the classpath flag; ignored when empty or None.
        """
        flags = ['-Djava.class.path={0}'.format(self._classpath)]
        if additional_flags:
            flags.extend(additional_flags)
        logging.info('jpype.isJVMStarted(): {0}'.format(jpype.isJVMStarted()))
        if not jpype.isJVMStarted():
            jpype.startJVM(jpype.getDefaultJVMPath(), *flags)

    def _create_classpath(self) -> str:
        """Build the Java classpath from the wrapper jar and `self.jars`.

        Returns:
            All jar paths joined with `os.pathsep`; the SUTime Python wrapper
            jar comes first, followed by every `*.jar` found below
            `self.jars`.

        Raises:
            RuntimeError: If any file name in `_required_jars` is not found
                below `self.jars`.
        """
        # `import importlib` alone does not guarantee that the `util`
        # submodule is loaded, so import it explicitly here.
        import importlib.util

        spec = importlib.util.find_spec('sutime')
        if spec is not None and spec.origin:
            package_root = Path(spec.origin).parent
        else:
            # Package not importable under the name `sutime` (e.g. vendored
            # or renamed): fall back to the directory of this module.
            package_root = Path(__file__).resolve().parent
        sutime_jar = package_root / 'jars' / self._sutime_python_jar

        jars = [sutime_jar]
        jar_file_names = set()
        for top, _, files in os.walk(self.jars):
            for file_name in files:
                if file_name.endswith('.jar'):
                    jars.append(Path(top, file_name))
                    jar_file_names.add(file_name)

        missing_jars = self._required_jars - jar_file_names
        if missing_jars:
            logging.warning(
                'Missing Java dependencies: %s',
                ', '.join(sorted(missing_jars)),
            )
            raise RuntimeError(
                'Not all necessary Java dependencies have been downloaded!',
            )
        return os.pathsep.join(str(jar) for jar in jars)
|
206
|
|
|
|