|
1
|
|
|
#!/usr/bin/env python3 |
|
2
|
|
|
|
|
3
|
|
|
import requests |
|
4
|
|
|
import json |
|
5
|
|
|
import pickle |
|
6
|
|
|
import sys |
|
7
|
|
|
from nltk.corpus import wordnet as wn |
|
8
|
|
|
|
|
9
|
|
|
# Language code used as the key when reading the 'aliases' and 'labels'
# of Wikidata entities in buildWikidataProperties().
default_language = 'en'
|
10
|
|
|
|
|
11
|
|
|
def buildWikidataProperties(maxRange=60):
    """
        Return the set of all Wikidata properties.

        Property ids P1 .. P(50*maxRange) are requested from the Wikidata
        ``wbgetentities`` API in batches of 50 ids. For every entity that is
        not flagged as 'missing', its aliases and its label in
        ``default_language`` are added to the result. Progress is printed
        to stdout as "<batch>/<maxRange>".

        maxRange: number of 50-id batches to request (default 60).

        Returns a set of strings (labels and aliases).

        Raises whatever ``requests.get`` / ``Response.json`` raise on network
        failure, timeout or a non-JSON body.
    """
    batchSize = 50  # ids per request; presumably the API's per-request cap -- confirm
    properties = set()
    for i in range(maxRange):
        print("%d/%d"%(i+1, maxRange))
        firstId = batchSize*i + 1
        propertiesIDs = '|'.join('P%d' % x for x in range(firstId, firstId + batchSize))
        # Without a timeout a stalled connection would block the script forever.
        request = requests.get(
            'http://www.wikidata.org/w/api.php',
            params={'action':'wbgetentities', 'sites':'itwiki', 'ids':propertiesIDs, 'format':'json'},
            timeout=30,
        )
        j = request.json()
        # A response without a 'success' key is treated like an unsuccessful
        # batch and skipped, instead of raising KeyError.
        if not j.get('success'):
            continue
        for prop in j.get('entities', {}).values():
            if 'missing' in prop:
                continue
            # Aliases are optional: no alias in this language is not an error.
            for alias in prop.get('aliases', {}).get(default_language, []):
                properties.add(alias['value'])
            # An entity may have no label in this language; skip it rather
            # than abort the whole crawl with a KeyError.
            label = prop.get('labels', {}).get(default_language)
            if label is not None:
                properties.add(label['value'])
    return properties
|
34
|
|
|
|
|
35
|
|
|
def buildNouns():
    """
        Collect the base names of all noun synsets exposed by NLTK's WordNet.

        Every synset name is cut at its first '.', and only the part before
        it is kept. Returns the set of those strings.
    """
    nouns = set()
    for synset in wn.all_synsets('n'):
        # Keep everything before the first dot (the whole name if there is none).
        base, _, _ = synset.name().partition('.')
        nouns.add(base)
    return nouns
|
40
|
|
|
|
|
41
|
|
|
def buildVerbs():
    """
        Collect the base names of all verb synsets exposed by NLTK's WordNet.

        Every synset name is cut at its first '.', and only the part before
        it is kept. Returns the set of those strings.
    """
    def baseName(synset):
        # Part of the synset name that precedes the first dot.
        return synset.name().partition(".")[0]

    return set(map(baseName, wn.all_synsets("v")))
|
46
|
|
|
|
|
47
|
|
|
if __name__ == "__main__":
    # Command-line entry point: build the requested word set and pickle it
    # into the given storage file.
    usage = "Syntax: ./%s storage_file -<database description> (wiki : Wikidata properties, n : nouns, v : verbs)" % sys.argv[0] # ex: ./extractors.py file.pkl -wiki
    if len(sys.argv) != 3:
        sys.exit(usage)
    # Map each supported option to the function that builds its data set.
    builders = {
        '-wiki': buildWikidataProperties,
        '-n': buildNouns,
        '-v': buildVerbs,
    }
    builder = builders.get(sys.argv[2])
    if builder is None:
        # An unknown option used to pickle an empty dict silently; report
        # the usage error instead of producing a useless file.
        sys.exit(usage)
    # Build first, so a failing builder never leaves a truncated output file.
    data = builder()
    # The context manager closes the file even if pickle.dump raises.
    with open(sys.argv[1], 'wb') as f:
        pickle.dump(data, f)
|
60
|
|
|
|