#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 14 16:06:10 2019

@author: Paolo Cozzi <[email protected]>
"""

import asyncio
import aiohttp
import requests

from yarl import URL
from multidict import MultiDict

from datetime import timedelta
from celery.utils.log import get_task_logger
from django.utils import timezone
from django.utils.dateparse import parse_date

from common.constants import COMPLETED, BIOSAMPLE_URL
from common.helpers import format_attribute
from common.tasks import BaseTask, NotifyAdminTaskMixin, exclusive_task
from image.celery import app as celery_app
from uid.models import Animal as UIDAnimal, Sample as UIDSample, DictSpecie

from ..helpers import get_manager_auth
from ..models import Submission, OrphanSample, ManagedTeam

# Get an instance of a logger
logger = get_task_logger(__name__)

# defining constants: clean biosample database data after this many days
CLEANUP_DAYS = 30

# this is the timedelta which I want to add to releaseDate to remove samples
RELEASE_TIMEDELTA = timedelta(days=365*1000)

# Setting page size for biosample requests
PAGE_SIZE = 20

PARAMS = MultiDict([
    ('size', PAGE_SIZE),
    ('filter', 'attr:project:IMAGE'),
])
HEADERS = {
    'Accept': 'application/hal+json',
}
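
# With these defaults, every page request is effectively:
#     GET <BIOSAMPLE_URL>?size=20&filter=attr:project:IMAGE
# sent with an "Accept: application/hal+json" header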


class CleanUpTask(NotifyAdminTaskMixin, BaseTask):
    """Perform biosample.models cleanup by selecting old, completed
    submissions and removing them from the database"""

    name = "Clean biosample models"
    description = """Clean biosample models"""

    @exclusive_task(task_name="Clean biosample models", lock_id="CleanUpTask")
    def run(self):
        """
        This function is called when delay is called. It will acquire a lock
        in redis, so those tasks are mutually exclusive

        Returns:
            str: success if everything is ok. Different messages if the task
            is already running or an exception is caught"""

        logger.info("Clean biosample.database started")

        # get an interval starting from now
        interval = timezone.now() - timedelta(days=CLEANUP_DAYS)

        # select all COMPLETED objects older than interval
        qs = Submission.objects.filter(
            updated_at__lt=interval,
            status=COMPLETED)

        logger.info(
            "Deleting %s biosample.models.Submission objects" % qs.count())

        # delete all old objects
        qs.delete()

        # debug
        logger.info("Clean biosample.database completed")

        return "success"
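
# Scheduling sketch (illustrative, not part of this module): the cleanup
# could be run periodically with celery beat, e.g. in the celery config:
#
#     celery_app.conf.beat_schedule = {
#         'clean-biosample-models': {
#             'task': 'Clean biosample models',
#             'schedule': timedelta(days=1),
#         },
#     }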


async def fetch(session, url=BIOSAMPLE_URL, params=PARAMS):
    """Get a page from biosamples"""

    # define a URL with yarl
    url = URL(url)
    url = url.update_query(params)

    async with session.get(url, headers=HEADERS) as response:
        return await response.json()
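
# For reference, an abridged sketch of the HAL+JSON page returned by fetch()
# and consumed below (all values are illustrative):
#
#     {
#         "_embedded": {
#             "samples": [
#                 {"accession": "SAMEA1234567", "name": "...",
#                  "domain": "subs.example-team", ...}
#             ]
#         },
#         "page": {"totalPages": 42, ...}
#     }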


async def parse_samples_data(data, managed_domains):
    """Yield samples from a BioSamples page, skipping unmanaged domains"""

    # get samples objects
    try:
        samples = data['_embedded']['samples']

        for sample in samples:
            # filter out unmanaged records
            if sample['domain'] not in managed_domains:
                logger.warning("Ignoring %s" % (sample['name']))
                continue

            # otherwise return the sample to the caller
            yield sample

    except KeyError as exc:
        # log the exception. With repr() the exception name is rendered
        logger.error(repr(exc))
        logger.warning("error while parsing samples")
        logger.warning(data)


async def get_samples(
        url=BIOSAMPLE_URL,
        params=PARAMS,
        managed_domains=None):
    """Fetch every BioSamples page and yield samples in managed domains"""

    # don't use a mutable object as a default argument
    if managed_domains is None:
        managed_domains = []

    async with aiohttp.ClientSession() as session:
        data = await fetch(session, url, params)

        # process data and filter samples I own
        # https://stackoverflow.com/a/47378063
        async for sample in parse_samples_data(data, managed_domains):
            yield sample

        tasks = []

        # get the number of pages
        totalPages = data['page']['totalPages']

        # generate new awaitable objects
        for page in range(1, totalPages):
            # get a new param object to edit
            my_params = params.copy()

            # edit a multidict object
            my_params.update(page=page)

            # track the new awaitable object
            tasks.append(fetch(session, url, my_params))

        # Run awaitable objects in the aws set concurrently.
        # Return an iterator of Future objects.
        for task in asyncio.as_completed(tasks):
            # read data
            data = await task

            # process data and filter samples I own
            # https://stackoverflow.com/a/47378063
            async for sample in parse_samples_data(data, managed_domains):
                yield sample
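
# Usage sketch (illustrative): get_samples is an async generator, so it must
# be driven from a coroutine; "subs.example-team" is a made-up domain:
#
#     async def dump_managed_names():
#         async for sample in get_samples(
#                 managed_domains=["subs.example-team"]):
#             print(sample['name'])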


async def check_samples():
    """Search BioSamples for managed records and track orphans"""

    # I need a pyUSIrest.auth.Auth object to filter out records that don't
    # belong to me
    auth = get_manager_auth()
    managed_domains = auth.get_domains()

    async for sample in get_samples(managed_domains=managed_domains):
        check_orphan_sample(sample)


def check_orphan_sample(sample):
    """Mark a BioSamples record as orphan if it is not tracked in UID"""

    animal_qs = UIDAnimal.objects.filter(
        biosample_id=sample['accession'])

    sample_qs = UIDSample.objects.filter(
        biosample_id=sample['accession'])

    if animal_qs.exists() or sample_qs.exists():
        logger.debug("Sample %s is tracked in UID" % (sample['accession']))

    else:
        # get a managed team
        team = ManagedTeam.objects.get(name=sample["domain"])

        # create an orphan sample; get_or_create makes this idempotent
        orphan, created = OrphanSample.objects.get_or_create(
            biosample_id=sample['accession'],
            name=sample['name'],
            team=team)

        if created:
            logger.warning("Add %s to orphan samples" % sample['accession'])


class SearchOrphanTask(NotifyAdminTaskMixin, BaseTask):
    """Search across BioSamples for objects not present in UID"""

    name = "Search Orphan BioSamples IDs"
    description = """Track BioSamples IDs not present in UID"""

    @exclusive_task(
        task_name=name, lock_id="SearchOrphanTask")
    def run(self):
        """
        This function is called when delay is called. It will acquire a lock
        in redis, so those tasks are mutually exclusive

        Returns:
            str: success if everything is ok. Different messages if the task
            is already running or an exception is caught"""

        logger.info("%s started" % (self.name))

        # create a new loop object: closing the thread's default loop would
        # break any later run of this task in the same worker process
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        # execute stuff
        try:
            loop.run_until_complete(check_samples())

        finally:
            # close loop
            loop.close()

        # debug
        logger.info("%s completed" % (self.name))

        return "success"
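
# Usage sketch (illustrative): once registered below, the task can be
# triggered through the celery registry by its name, e.g.
#
#     celery_app.tasks["Search Orphan BioSamples IDs"].delay()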


def purge_orphan_samples():
    """A generator which reads not-yet-removed objects from the OrphanSample
    table and yields the patched BioSamples payloads needed to remove them"""

    with requests.Session() as session:
        for orphan_sample in OrphanSample.objects.filter(
                ignore=False, removed=False):

            # define the url I need to check
            url = "/".join([BIOSAMPLE_URL, orphan_sample.biosample_id])

            # read data from url
            response = session.get(url)
            data = response.json()

            # I need a new data dictionary to submit
            new_data = dict()

            # I suppose the accession exists, since I found this sample
            # using accession [biosample.id]
            new_data['accession'] = data.get(
                'accession', orphan_sample.biosample_id)

            new_data['alias'] = data['name']

            new_data['title'] = data['characteristics']['title'][0]['text']

            # this is the most important attribute: pushing releaseDate far
            # into the future effectively removes the sample from public view
            new_data['releaseDate'] = str(
                parse_date(data['releaseDate']) + RELEASE_TIMEDELTA)

            new_data['taxonId'] = data['taxId']

            # determine the taxon label from the taxId
            new_data['taxon'] = DictSpecie.objects.get(
                term__endswith=data['taxId']).label

            new_data['attributes'] = dict()

            # set project again
            new_data['attributes']["Project"] = format_attribute(
                value="IMAGE")

            # return new biosample data
            yield new_data
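
# For reference, a sketch of a payload yielded above (values illustrative;
# the attributes entry depends on what format_attribute() returns):
#
#     {
#         'accession': 'SAMEA1234567',
#         'alias': 'IMAGEA000001234',
#         'title': '...',
#         'releaseDate': '3019-05-27',
#         'taxonId': 9940,
#         'taxon': 'Ovis aries',
#         'attributes': {'Project': ...},
#     }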


# register tasks explicitly
# https://github.com/celery/celery/issues/3744#issuecomment-271366923
celery_app.tasks.register(CleanUpTask)
celery_app.tasks.register(SearchOrphanTask)