#!/usr/bin/env python3

import requests
import json
import pickle
import sys
from nltk.corpus import wordnet as wn

# Language code used to pick which labels and aliases are read from the
# Wikidata entities.
default_language = 'en'


def buildWikidataProperties():
    """
    Return the set of all Wikidata properties.

    Queries the Wikidata ``wbgetentities`` API for the property IDs P1 to
    P3000, 50 IDs per request, and collects the label and the aliases that
    each existing property has in ``default_language``. Progress is printed
    on standard output as ``<batch>/<number of batches>``.

    Returns:
        set of str: the labels and aliases that were found.

    Raises:
        requests.exceptions.RequestException: on a network failure or when
            a request times out.
    """
    properties = set()
    maxRange = 60
    batchSize = 50
    for i in range(maxRange):
        print("%d/%d" % (i + 1, maxRange))
        firstId = batchSize * i + 1
        propertiesIDs = '|'.join('P%d' % x for x in range(firstId, firstId + batchSize))
        # NOTE(review): 'sites' is kept as in the original request; it looks
        # unused when 'ids' is given -- confirm before removing it.
        request = requests.get(
            'http://www.wikidata.org/w/api.php',
            params={'action': 'wbgetentities', 'sites': 'itwiki', 'ids': propertiesIDs, 'format': 'json'},
            timeout=30,  # do not hang forever on a stalled connection
        )
        j = request.json()
        # An error payload may carry no 'success' key at all, so a missing
        # key is treated like an explicit 0.
        if not j.get('success'):
            continue
        for prop in j.get('entities', {}).values():
            if 'missing' in prop:
                continue
            # A property may have no alias and/or no label in the requested
            # language; take whatever is available instead of failing.
            for alias in prop.get('aliases', {}).get(default_language, []):
                properties.add(alias['value'])
            label = prop.get('labels', {}).get(default_language)
            if label is not None:
                properties.add(label['value'])
    return properties


def buildNouns():
    """
    Returns the set of all nouns of NLTK

    Each entry is the part of a WordNet noun synset name that precedes its
    first '.'.

    Returns:
        set of str: one entry per distinct noun synset word.
    """
    # NOTE(review): a synset name whose word part itself contains a '.'
    # would be cut at that first dot -- confirm this is acceptable.
    return {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}


def buildVerbs():
    """
    Returns the set of all verbs of NLTK

    Each entry is the part of a WordNet verb synset name that precedes its
    first '.'.

    Returns:
        set of str: one entry per distinct verb synset word.
    """
    # NOTE(review): a synset name whose word part itself contains a '.'
    # would be cut at that first dot -- confirm this is acceptable.
    return {x.name().split(".", 1)[0] for x in wn.all_synsets("v")}


if __name__ == "__main__":
    # Command-line entry point: build the requested word set and pickle it
    # into the given file.
    # ex: ./extractors.py file.pkl -wiki
    usage = "Syntax: ./%s storage_file -<database description> (wiki : Wikidata properties, n : nouns, v : verbs)" % sys.argv[0]
    if len(sys.argv) != 3:
        sys.exit(usage)
    builders = {
        '-wiki': buildWikidataProperties,
        '-n': buildNouns,
        '-v': buildVerbs,
    }
    builder = builders.get(sys.argv[2])
    if builder is None:
        # An unknown option used to pickle an empty dict silently; report
        # the mistake instead of producing a useless file.
        sys.exit(usage)
    # Build first, so that a failure does not leave a truncated output file.
    data = builder()
    # The context manager closes the file even if pickling fails.
    with open(sys.argv[1], 'wb') as f:
        pickle.dump(data, f)