#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<[email protected]>
# http://binux.me
# Created on 2016-01-20 20:20:55
import time |
10
|
|
|
import json |
11
|
|
|
|
12
|
|
|
import elasticsearch.helpers |
13
|
|
|
from elasticsearch import Elasticsearch |
14
|
|
|
from pyspider.database.base.taskdb import TaskDB as BaseTaskDB |
15
|
|
|
|
16
|
|
|
|
17
|
|
|
class TaskDB(BaseTaskDB):
    """Task database backed by Elasticsearch.

    Each task is stored as one document of type ``task`` in a single
    index, with the document id ``<project>:<taskid>``.  The dict-valued
    task fields (``schedule``/``fetch``/``process``/``track``) are stored
    as JSON strings and (de)serialized by ``_stringify``/``_parse``.
    """

    __type__ = 'task'

    def __init__(self, hosts, index='pyspider'):
        """Connect to Elasticsearch and make sure the index and mapping exist.

        :param hosts: host spec passed straight to the ``Elasticsearch`` client.
        :param index: name of the index that holds the task documents.
        """
        self.index = index
        # True after any write since the last refresh; lets readers decide
        # whether an explicit index refresh is needed before searching.
        self._changed = False
        self.es = Elasticsearch(hosts=hosts)

        # ignore=400: the index may already exist ("index_already_exists").
        self.es.indices.create(index=self.index, ignore=400)
        if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__):
            self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={
                "_all": {"enabled": False},
                "properties": {
                    # not_analyzed so term queries match the project name exactly.
                    "project": {"type": "string", "index": "not_analyzed"},
                    "status": {"type": "byte"},
                }
            })

    def _parse(self, data):
        """Decode the JSON-encoded sub-fields of a stored task document.

        Falsy (missing) documents are returned unchanged; empty sub-fields
        become empty dicts.
        """
        if not data:
            return data
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                if data[each]:
                    data[each] = json.loads(data[each])
                else:
                    data[each] = {}
        return data

    def _stringify(self, data):
        """JSON-encode the dict-valued sub-fields in place before storing."""
        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                data[each] = json.dumps(data[each])
        return data

    @property
    def projects(self):
        """Distinct project names present in the index (terms aggregation)."""
        ret = self.es.search(index=self.index, doc_type=self.__type__,
                             body={"aggs": {"projects": {
                                 "terms": {"field": "project"}
                             }}}, _source=False)
        return [each['key'] for each in ret['aggregations']['projects'].get('buckets', [])]

    def load_tasks(self, status, project=None, fields=None):
        """Yield parsed tasks with the given status.

        :param status: status value to filter on.
        :param project: a single project name, or None for every project.
        :param fields: optional list of source fields to fetch.
        """
        self.refresh()
        if project is None:
            # Fan out across all known projects, one scan per project.
            for project in self.projects:
                for each in self.load_tasks(status, project, fields):
                    yield each
        else:
            for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,
                                                     query={'query': {'bool': {
                                                         'must': {'term': {'project': project}},
                                                         'filter': {'term': {'status': status}},
                                                     }}}, _source_include=fields or []):
                yield self._parse(record['_source'])

    def get_task(self, project, taskid, fields=None):
        """Return one parsed task dict, or None if it does not exist.

        Only refreshes the index when writes happened since the last refresh.
        """
        if self._changed:
            self.refresh()
        # ignore=404: a missing document yields a response without '_source'.
        ret = self.es.get(index=self.index, doc_type=self.__type__, id="%s:%s" % (project, taskid),
                          _source_include=fields or [], ignore=404)
        return self._parse(ret.get('_source', None))

    def status_count(self, project):
        """Return a {status: doc_count} mapping for the given project."""
        self.refresh()
        ret = self.es.search(index=self.index, doc_type=self.__type__,
                             body={"query": {'term': {'project': project}},
                                   "aggs": {"status": {
                                       "terms": {"field": "status"}
                                   }}}, _source=False)
        result = {}
        for each in ret['aggregations']['status'].get('buckets', []):
            result[each['key']] = each['doc_count']
        return result

    def insert(self, project, taskid, obj=None):
        """Create (or overwrite) the task document for project/taskid.

        NOTE: default changed from a shared mutable ``{}`` to ``None``;
        behavior is identical for callers since ``obj`` was always copied.
        """
        self._changed = True
        obj = dict(obj or {})
        obj['taskid'] = taskid
        obj['project'] = project
        obj['updatetime'] = time.time()
        return self.es.index(index=self.index, doc_type=self.__type__,
                             body=self._stringify(obj), id='%s:%s' % (project, taskid))

    def update(self, project, taskid, obj=None, **kwargs):
        """Partially update fields of an existing task document.

        Extra keyword arguments are merged into ``obj`` before the update.
        NOTE: default changed from a shared mutable ``{}`` to ``None``;
        behavior is identical for callers since ``obj`` was always copied.
        """
        self._changed = True
        obj = dict(obj or {})
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        # ignore=404: updating a nonexistent task is a silent no-op.
        return self.es.update(index=self.index, doc_type=self.__type__, id='%s:%s' % (project, taskid),
                              body={"doc": self._stringify(obj)}, ignore=404)

    def drop(self, project):
        """Delete every task document belonging to the given project."""
        self.refresh()
        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,
                                                 query={'query': {'term': {'project': project}}},
                                                 _source=False):
            self.es.delete(index=self.index, doc_type=self.__type__, id=record['_id'])
        self.refresh()

    def refresh(self):
        """
        Explicitly refresh one or more index, making all operations
        performed since the last refresh available for search.
        """
        self._changed = False
        self.es.indices.refresh(index=self.index)