1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
import calendar |
3
|
|
|
import datetime |
4
|
|
|
|
5
|
|
|
from dateutil.relativedelta import relativedelta |
6
|
|
|
from sqlalchemy import func |
7
|
|
|
|
8
|
|
|
from pyjobsweb import model |
9
|
|
|
|
10
|
|
|
|
11
|
|
|
class StatsQuestioner(object): |
12
|
|
|
""" |
13
|
|
|
Provide database query method and query result formatting. |
14
|
|
|
""" |
15
|
|
|
|
16
|
|
|
PERIOD_MONTH = 'mon' |
17
|
|
|
PERIOD_WEEK = 'week' |
18
|
|
|
|
19
|
|
|
FIELD_SOURCE = 0 |
20
|
|
|
FIELD_COUNT = 1 |
21
|
|
|
FIELD_DATE = 2 |
22
|
|
|
|
23
|
|
|
FIELDS = { |
24
|
|
|
FIELD_SOURCE: 'source', |
25
|
|
|
FIELD_COUNT: 'jobs', |
26
|
|
|
FIELD_DATE: 'date' |
27
|
|
|
} |
28
|
|
|
|
29
|
|
|
FLAT_Y_FIELD = FIELD_SOURCE |
30
|
|
|
FLAT_X_FIELD = FIELD_DATE |
31
|
|
|
|
32
|
|
|
@staticmethod |
33
|
|
|
def get_month_period(period_range): |
34
|
|
|
""" |
35
|
|
|
return a start date and a end date of x complete previous month |
36
|
|
|
:param period_range: number of months |
37
|
|
|
:return: date_from (datetime), date_to (datetime) |
38
|
|
|
""" |
39
|
|
|
|
40
|
|
|
today = datetime.datetime.today().replace(hour=23, minute=59, second=59, microsecond=999999) |
41
|
|
|
current_month = today.replace(day=1) |
42
|
|
|
last_day_of_current_month = calendar.monthrange(today.year, today.month)[1] |
43
|
|
|
|
44
|
|
|
if today.day == last_day_of_current_month: |
45
|
|
|
date_from = current_month + relativedelta(months=-period_range - 1) |
46
|
|
|
date_to = today |
47
|
|
|
else: |
48
|
|
|
date_from = current_month + relativedelta(months=-period_range) |
49
|
|
|
date_to = current_month - datetime.timedelta(days=1) |
50
|
|
|
|
51
|
|
|
return date_from, date_to |
52
|
|
|
|
53
|
|
|
@classmethod |
54
|
|
|
def extract(cls, query_result, field): |
55
|
|
|
""" |
56
|
|
|
Simple make a list containing asked field value from query_result |
57
|
|
|
:param query_result: result from StatsQuestioner query |
58
|
|
|
:param field: field to be extracted |
59
|
|
|
:return: list containing asked field value from query_result |
60
|
|
|
""" |
61
|
|
|
return sorted(set([r[field] for r in query_result])) |
62
|
|
|
|
63
|
|
|
@classmethod |
64
|
|
|
def extract_stats(cls, query_result, sources): |
65
|
|
|
""" |
66
|
|
|
Return a dict of stats: { source_name: { datetime: { job_count } } } |
67
|
|
|
:param query_result: result from StatsQuestioner query |
68
|
|
|
:param sources: pyjobs_crawler sources |
69
|
|
|
:return: dict of sources where keys value are dict of date:jobs_count |
70
|
|
|
""" |
71
|
|
|
periods = cls.extract(query_result, cls.FIELD_DATE) |
72
|
|
|
|
73
|
|
|
stats = {} |
74
|
|
|
for source in sources: |
75
|
|
|
stats[source] = {} |
76
|
|
|
for period in periods: |
77
|
|
|
for query_line in query_result: |
78
|
|
|
query_line_source, query_line_jobs_count, query_line_date = query_line |
79
|
|
|
if query_line_source == source \ |
80
|
|
|
and query_line_date == period: |
81
|
|
|
stats[source][period] = query_line_jobs_count |
82
|
|
|
if period not in stats[source]: |
83
|
|
|
stats[source][period] = 0 |
84
|
|
|
|
85
|
|
|
return stats |
86
|
|
|
|
87
|
|
|
@classmethod |
88
|
|
|
def flat_query_by_y(cls, query_result, sources, date_value_callback=lambda date: date.strftime('%Y-%m-%d')): |
89
|
|
|
""" |
90
|
|
|
Return flatted stats: [{date: 'YYYY-mm-dd', source_1: job_count, source_2: job_count, source_...}, ...] |
91
|
|
|
:param query_result: result from StatsQuestioner query |
92
|
|
|
:param sources: pyjobs_crawler sources |
93
|
|
|
:param date_value_callback: callback used to format |
94
|
|
|
:return: list of flatted stats dicts |
95
|
|
|
""" |
96
|
|
|
periods = cls.extract(query_result, cls.FIELD_DATE) |
97
|
|
|
|
98
|
|
|
flat = [] |
99
|
|
|
for period in periods: |
100
|
|
|
flat_line = cls.extract_flat_dict_for_field( |
101
|
|
|
query_result=query_result, |
102
|
|
|
where_field=cls.FLAT_X_FIELD, |
103
|
|
|
where_field_value=period, |
104
|
|
|
flat_field_key=cls.FLAT_Y_FIELD, |
105
|
|
|
flat_field_value=cls.FIELD_COUNT, |
106
|
|
|
sources=sources |
107
|
|
|
) |
108
|
|
|
# Stringify date |
109
|
|
|
flat_line[cls.FIELDS[cls.FIELD_DATE]] = date_value_callback(flat_line[cls.FIELDS[cls.FIELD_DATE]]) |
110
|
|
|
flat.append(flat_line) |
111
|
|
|
|
112
|
|
|
return flat |
113
|
|
|
|
114
|
|
|
@classmethod |
115
|
|
|
def extract_flat_dict_for_field(cls, query_result, where_field, where_field_value, flat_field_key, flat_field_value, |
116
|
|
|
sources): |
117
|
|
|
""" |
118
|
|
|
return a flatted dict based on field correspondence: |
119
|
|
|
{based_field_name: based_field_value, source_1: source_1_value, source_2: source_2_value, ...} |
120
|
|
|
Sources not found in query_result for field correspondence are filled by 0. |
121
|
|
|
:param query_result: result from StatsQuestioner query |
122
|
|
|
:param where_field: based field |
123
|
|
|
:param where_field_value: based field value |
124
|
|
|
:param flat_field_key: field where apply flatting (eg. source) |
125
|
|
|
:param flat_field_value: field to use for value of flat_field_key in dict (eg. count) |
126
|
|
|
:param sources: pyjobs_crawler sources |
127
|
|
|
:return: flatted dict |
128
|
|
|
""" |
129
|
|
|
extracted_dict = {cls.FIELDS[where_field]: where_field_value} |
130
|
|
|
|
131
|
|
|
for result_line in query_result: |
132
|
|
|
if result_line[where_field] == where_field_value: |
133
|
|
|
extracted_dict[result_line[flat_field_key]] = result_line[flat_field_value] |
134
|
|
|
|
135
|
|
|
for source in sources: |
136
|
|
|
if source not in extracted_dict: |
137
|
|
|
extracted_dict[source] = 0 |
138
|
|
|
|
139
|
|
|
return extracted_dict |
140
|
|
|
|
141
|
|
|
def __init__(self, session): |
142
|
|
|
self._session = session |
143
|
|
|
|
144
|
|
|
def by_complete_period(self, period, date_from, date_to): |
145
|
|
|
""" |
146
|
|
|
|
147
|
|
|
:param period: period unit (eg StatsQuestioner.PERIOD_MONTH) |
148
|
|
|
:param date_from: start date of concerned jobs |
149
|
|
|
:param date_to: end date of concerned jobs |
150
|
|
|
:return: sqlalchemy Query |
151
|
|
|
:rtype: sqlalchemy.orm.Query |
152
|
|
|
""" |
153
|
|
|
# TODO - B.S. - 20160204: date_trunc only compatible with postgresql |
154
|
|
|
date_trunc_func = \ |
155
|
|
|
func.date_trunc(period, model.JobAlchemy.publication_datetime) |
156
|
|
|
return self._session.query(model.JobAlchemy.source, |
157
|
|
|
func.count(model.JobAlchemy.id), |
158
|
|
|
date_trunc_func) \ |
159
|
|
|
.filter(model.JobAlchemy.publication_datetime >= date_from) \ |
160
|
|
|
.filter(model.JobAlchemy.publication_datetime <= date_to) \ |
161
|
|
|
.group_by(model.JobAlchemy.source) \ |
162
|
|
|
.group_by(date_trunc_func) \ |
163
|
|
|
.order_by(date_trunc_func) |
164
|
|
|
|