"""
Command-line interface for mandos.
"""
4
|
|
|
|
5
|
|
|
from __future__ import annotations |
6
|
|
|
|
7
|
|
|
from pathlib import Path |
8
|
|
|
from typing import Optional, List |
9
|
|
|
|
10
|
|
|
import typer |
|
|
|
|
11
|
|
|
|
12
|
|
|
from mandos import logger, MANDOS_SETUP |
13
|
|
|
from mandos.analysis.io_defns import SimilarityDfLongForm, SimilarityDfShortForm |
|
|
|
|
14
|
|
|
from mandos.analysis.concordance import ConcordanceCalculation |
15
|
|
|
from mandos.analysis.distances import MatrixCalculation |
16
|
|
|
from mandos.analysis.filtration import Filtration |
17
|
|
|
from mandos.analysis.enrichment import EnrichmentCalculation, RealAlg, BoolAlg |
18
|
|
|
from mandos.analysis.io_defns import ScoreDf |
19
|
|
|
from mandos.analysis.prepping import MatrixPrep |
20
|
|
|
from mandos.analysis.projection import UmapCalc |
|
|
|
|
21
|
|
|
from mandos.analysis.reification import Reifier |
22
|
|
|
from mandos.entries.common_args import Arg, CommonArgs |
23
|
|
|
from mandos.entries.common_args import CommonArgs as Ca |
|
|
|
|
24
|
|
|
from mandos.entries.common_args import Opt |
25
|
|
|
from mandos.entries.multi_searches import MultiSearch |
26
|
|
|
from mandos.entries.searcher import SearcherUtils, InputFrame, CompoundIdFiller, IdMatchFrame |
27
|
|
|
from mandos.model import START_TIMESTAMP, MiscUtils |
28
|
|
|
from mandos.model.hits import HitFrame |
29
|
|
|
from mandos.model.settings import MANDOS_SETTINGS |
30
|
|
|
from mandos.model.taxonomy_caches import TaxonomyFactories |
31
|
|
|
from mandos.analysis.projection import UMAP |
32
|
|
|
from mandos.model.rdkit_utils import RdkitUtils, Fingerprint |
|
|
|
|
33
|
|
|
|
34
|
|
|
# Short alias for the shared set-up callable; every command below invokes it
# as ``set_up(log, quiet, verbose)``.
set_up = MANDOS_SETUP
# Suffix appended to default output table filenames.
DEF_SUFFIX = MANDOS_SETTINGS.default_table_suffix

# Default UMAP parameters, used only to render the ``--params`` help text.
# ``UMAP`` may be None (reported elsewhere in this file as "UMAP is not available"),
# in which case nothing can be listed. ``random_state`` and ``metric`` are left out.
_umap_params = (
    {}
    if UMAP is None
    else {
        name: value
        for name, value in UMAP().get_params(deep=False).items()
        if name not in {"random_state", "metric"}
    }
)
45
|
|
|
|
46
|
|
|
|
47
|
|
|
class MiscCommands:
    """
    Implementations of the mandos command-line commands.

    Every command is a static method whose parameter defaults are argument/option
    declarations (``Arg``, ``Opt``, ``CommonArgs``/``Ca``, ``typer.Option``).
    The methods are presumably registered as typer commands elsewhere, in which case
    the method docstrings are shown to users as help text; keep them user-facing.
    Nearly all commands accept ``log``, ``quiet`` and ``verbose`` and forward them
    to ``set_up`` before doing any work.
    """

    @staticmethod
    def search(
        path: Path = Ca.compounds,
        config: Path = Arg.in_file(
            r"""
            TOML config file. See docs.
            """
        ),
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
        out_dir: Path = Ca.out_dir,
    ) -> None:
        """
        Run multiple searches.
        """
        set_up(log, quiet, verbose)
        MultiSearch.build(path, out_dir, config).run()

    @staticmethod
    def serve(
        port: int = Opt.val(r"Port to serve on", default=1540),
        db: str = Opt.val("Name of the MySQL database", default="mandos"),
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        r"""
        Start a REST server.

        The connection information is stored in your global settings file.
        """
        # NOTE(review): placeholder -- only performs set-up; ``port`` and ``db`` are unused.
        set_up(log, quiet, verbose)

    @staticmethod
    def deposit(
        path: Path = Ca.file_input,
        db: str = Opt.val(r"Name of the MySQL database", default="mandos"),
        host: str = Opt.val(
            # Fixed: the help text was missing its closing parenthesis.
            r"Database hostname (ignored if ``--socket`` is passed)",
            default="127.0.0.1",
        ),
        socket: Optional[str] = Opt.val("Path to a Unix socket (if set, ``--host`` is ignored)"),
        user: Optional[str] = Opt.val("Database username (empty if not set)"),
        password: Optional[str] = Opt.val("Database password (empty if not set)"),
        as_of: Optional[str] = CommonArgs.as_of,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        r"""
        Export to a relational database.

        Saves data from Mandos search commands to a database for serving via REST.

        See also: ``:serve``.
        """
        # NOTE(review): placeholder -- only performs set-up; the database arguments are unused.
        set_up(log, quiet, verbose)

    @staticmethod
    def fill(
        path: Path = Ca.compounds_to_fill,
        to: Path = Ca.id_table_to,
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        r"""
        Match IDs; fetch and cache compound data.

        Useful to check what you can see before running a search.
        """
        set_up(log, quiet, verbose)
        # Default output sits next to the input and is stamped with the start time.
        default = str(path) + "-ids" + START_TIMESTAMP + DEF_SUFFIX
        to = MiscUtils.adjust_filename(to, default, replace)
        df = IdMatchFrame.read_file(path)
        df = CompoundIdFiller.fill(df)
        df.write_file(to)
        typer.echo(f"Wrote to {to}")

    @staticmethod
    def cache(
        path: Path = Ca.compounds,
        no_pubchem: bool = Opt.flag(r"Do not download data from PubChem"),
        no_chembl: bool = Opt.flag(r"Do not fetch IDs from ChEMBL"),
        no_hmdb: bool = Opt.flag(r"Do not download data from HMDB"),
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        r"""
        Fetch and cache compound data.

        Useful to freeze data before running a search.
        """
        set_up(log, quiet, verbose)
        inchikeys = SearcherUtils.read(path)
        # The CLI flags are negative ("no_*"); the downloader takes positive switches.
        SearcherUtils.dl(inchikeys, pubchem=not no_pubchem, chembl=not no_chembl, hmdb=not no_hmdb)
        typer.echo("Done caching.")

    @staticmethod
    def build_taxonomy(
        taxa: str = Ca.taxa,
        forbid: str = Opt.val(
            r"""Exclude descendents of these taxa IDs or names (comma-separated).""", default=""
        ),
        to: Path = typer.Option(
            None,
            help=rf"""
            Where to export.

            {Ca.output_formats}

            [default: ./<taxa>-<datetime>.{DEF_SUFFIX}]
            """,
        ),
        replace: bool = Ca.replace,
        in_cache: bool = CommonArgs.in_cache,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        """
        Export a taxonomic tree to a table.

        Writes a taxonomy of given taxa and their descendants to a table.
        """
        # NOTE(review): ``in_cache`` is accepted but not used below.
        set_up(log, quiet, verbose)
        # Build the default filename stem from the raw strings, before parsing.
        # Empty parts are skipped so that an empty ``forbid`` (its default) gives
        # "<taxa>-<datetime>" as documented, rather than "<taxa>--<datetime>".
        stem = "-".join(part for part in (taxa, forbid) if part)
        allowed = Ca.parse_taxa(taxa)
        forbidden = Ca.parse_taxa(forbid)
        default = stem + "-" + START_TIMESTAMP + DEF_SUFFIX
        to = MiscUtils.adjust_filename(to, default, replace)
        tree = TaxonomyFactories.get_smart_taxonomy(allowed, forbidden)
        table = tree.to_df()
        to.parent.mkdir(exist_ok=True, parents=True)
        table.write_file(to)

    @staticmethod
    def dl_tax(
        taxa: str = Opt.val(
            r"""
            Either "vertebrata", "all", or a comma-separated list of UniProt taxon IDs.

            "all" is only valid when --replace is passed;
            this will regenerate all taxonomy files that are found in the cache.
            """,
            default="",
        ),
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        """
        Prep a new taxonomy file for use in mandos.

        With --replace set, will delete any existing file.
        This can be useful to make sure your cached taxonomy is up-to-date before running.

        Downloads and converts a tab-separated file from UniProt.
        (To find manually, follow the ``All lower taxonomy nodes`` link and click ``Download``.)
        Then applies fixes and reduces the file size, creating a new file alongside.
        Puts both the raw data and fixed data in the cache under ``~/.mandos/taxonomy/``.
        """
        # Fixed: set-up now runs first, so the "no taxa" message below honors
        # --log/--quiet/--verbose like every other command.
        set_up(log, quiet, verbose)
        if taxa == "":
            logger.info("No taxa were specified. No data downloaded.")
            return
        is_id_list = taxa.replace(",", "").replace(" ", "").isdigit()
        if taxa not in ["all", "vertebrata"] and not is_id_list:
            raise ValueError("Use either 'all', 'vertebrata', or a UniProt taxon ID")
        if taxa == "all" and not replace:
            raise ValueError("Use --replace with taxon 'all'")
        factory = TaxonomyFactories.from_uniprot()
        if taxa == "all":
            # ``replace`` is guaranteed here by the check above.
            # Wipe every cached file, then rebuild vertebrata and reload each cached taxon.
            listed = TaxonomyFactories.list_cached_files()
            for cached_path in listed.values():
                cached_path.unlink()
            factory.rebuild_vertebrata()
            for cached_taxon in listed.keys():
                factory.load_dl(cached_taxon)
        elif taxa == "vertebrata" and (replace or not factory.resolve_path(7742).exists()):
            factory.rebuild_vertebrata()
        elif taxa == "vertebrata":
            factory.load_vertebrate(7742)  # should usually do nothing
        else:
            for taxon in [int(t.strip()) for t in taxa.split(",")]:
                # Fixed: the cached file used to be deleted unconditionally and nothing
                # was loaded afterwards. Now it is deleted only with --replace, and the
                # taxon is then loaded the same way the "all" branch does.
                # NOTE(review): assumes load_dl accepts an int taxon ID -- confirm.
                if replace:
                    factory.delete_exact(taxon)
                factory.load_dl(taxon)

    @staticmethod
    def concat(
        path: Path = Ca.input_dir,
        to: Optional[Path] = Ca.to_single,
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        r"""
        Concatenate Mandos annotation files into one.

        Note that ``:search`` automatically performs this;
        this is needed only if you want to combine results from multiple independent searches.
        """
        set_up(log, quiet, verbose)
        default = path / ("concat" + DEF_SUFFIX)
        to = MiscUtils.adjust_filename(to, default, replace)
        # NOTE(review): the loop body is a placeholder; nothing is read or written yet.
        for found in path.iterdir():
            pass

    @staticmethod
    def filter(
        # NOTE(review): ``path`` is the *input* file but is declared with the output
        # option ``Ca.to_single`` (same as ``to`` below); ``Ca.file_input`` is probably
        # what was intended. Left unchanged to keep the CLI stable -- confirm.
        path: Path = Ca.to_single,
        by: Optional[Path] = Arg.in_file(
            r"""
            Path to a TOML (.toml) file containing filters.

            The file contains a list of ``mandos.filter`` keys,
            each containing an expression on a single column.
            This is only meant for simple, quick-and-dirty filtration.

            See the docs for more info.
            """
        ),
        to: Optional[Path] = Ca.to_single,
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        """
        Filters by simple expressions.
        """
        set_up(log, quiet, verbose)
        default = str(path) + "-filter-" + by.stem + DEF_SUFFIX
        to = MiscUtils.adjust_filename(to, default, replace)
        df = HitFrame.read_file(path)
        Filtration.from_file(by).apply(df).write_file(to)

    @staticmethod
    def state(
        path: Path = Ca.file_input,
        to: Optional[Path] = Opt.out_path(
            """
            Path to the output file.

            Valid formats and filename suffixes are .nt and .txt with an optional .gz, .zip, or .xz.
            If only a filename suffix is provided, will use that suffix with the default directory.
            If no suffix is provided, will interpret the path as a directory and use the default filename.
            Will fail if the file exists and ``--replace`` is not set.

            [default: <path>-statements.nt]
            """
        ),
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        """
        Output simple N-triples statements.

        Each statement is of this form, where the InChI Key refers to the input data:

        `"InChI Key" "predicate" "object" .`
        """
        set_up(log, quiet, verbose)
        default = f"{path}-statements.nt"
        to = MiscUtils.adjust_filename(to, default, replace)
        hits = HitFrame.read_file(path).to_hits()
        # Fixed: the file used to be opened in the default read mode, so every
        # write raised io.UnsupportedOperation. N-Triples documents are UTF-8.
        # NOTE(review): a plain open() does not compress, although the help text
        # advertises .gz/.zip/.xz suffixes.
        with to.open("w", encoding="utf-8") as f:
            for hit in hits:
                f.write(hit.to_triple.n_triples)

    @staticmethod
    def reify(
        path: Path = Ca.file_input,
        to: Optional[Path] = Opt.out_path(
            r"""
            Path to the output file.

            The filename suffix should be either .nt (N-triples) or .ttl (Turtle),
            with an optional .gz, .zip, or .xz.
            If only a filename suffix is provided, will use that suffix with the default directory.
            If no suffix is provided, will interpret the path as a directory but use the default filename.
            Will fail if the file exists and ``--replace`` is not set.

            [default: <path>-reified.nt]
            """
        ),
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        """
        Outputs reified semantic triples.
        """
        set_up(log, quiet, verbose)
        default = f"{path}-reified.nt"
        to = MiscUtils.adjust_filename(to, default, replace)
        hits = HitFrame.read_file(path).to_hits()
        # Fixed: open for writing (the default mode is read-only, which made
        # every write fail). N-Triples documents are UTF-8.
        with to.open("w", encoding="utf-8") as f:
            for triple in Reifier().reify(hits):
                f.write(triple.n_triples)

    @staticmethod
    def copy(
        path: Path = Ca.file_input,
        to: Optional[Path] = Opt.out_path(
            rf"""
            Path to the output file.

            {Ca.output_formats}

            [default: <path.parent>/export{DEF_SUFFIX}]
            """
        ),
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        """
        Copies and/or converts annotation files.

        Example: ``:export:copy --to .snappy`` to highly compress a data set.
        """
        set_up(log, quiet, verbose)
        # Fixed: the default used to be ``path.parent / DEF_SUFFIX`` (a file named
        # after the bare suffix); the help text above documents "export" + suffix.
        default = path.parent / ("export" + DEF_SUFFIX)
        to = MiscUtils.adjust_filename(to, default, replace)
        df = HitFrame.read_file(path)
        df.write_file(to)

    @staticmethod
    def analyze(
        path: Path = Ca.file_input,
        phi: Path = Ca.input_matrix,
        scores: Path = Ca.alpha_input,
        seed: int = Ca.seed,
        samples: int = Ca.boot,
        to: Optional[Path] = Ca.misc_out_dir,
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        """
        Shorthand for multiple calculations and plots.

        Generates n-triple statements and reified n-triples.
        Calculates correlation and enrichment using ``scores``,
        psi matrices (one per variable), and concordance between psi and tau matrices (tau).
        Plots UMAP of psi variables, enrichment bar plots, correlation violin plots,
        phi-vs-psi scatter and line plots, and phi-vs-psi (tau) violin plots.
        """
        # NOTE(review): no implementation yet -- the body is the docstring only.

    @staticmethod
    def alpha(
        path: Path = Ca.file_input,
        scores: Path = Ca.alpha_input,
        bool_alg: Optional[str] = Opt.val(
            rf"""
            Algorithm to use for scores starting with 'is_'.

            Allowed values: {Ca.list(BoolAlg)}
            """,
            default="alpha",
        ),
        real_alg: Optional[str] = Opt.val(
            rf"""
            Algorithm to use for scores starting with 'score_'.

            Allowed values: {Ca.list(RealAlg)}
            """,
            default="weighted",
        ),
        on: bool = Ca.on,
        boot: int = Ca.boot,
        seed: int = Ca.seed,
        to: Optional[Path] = Ca.alpha_to,
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        """
        Compare annotations to user-supplied values.

        Calculates correlation between provided scores and object/predicate pairs.
        For booleans, compares annotations for hits and non-hits.
        See the docs for more info.
        """
        # NOTE(review): ``on`` is accepted but not used below.
        set_up(log, quiet, verbose)
        # The default name is derived from both input paths before ``scores`` is loaded.
        default = f"{path}-{scores.name}{DEF_SUFFIX}"
        to = MiscUtils.adjust_filename(to, default, replace)
        hits = HitFrame.read_file(path)
        score_df = ScoreDf.read_file(scores)
        calculator = EnrichmentCalculation(bool_alg, real_alg, boot, seed)
        df = calculator.calculate(hits, score_df)
        df.write_file(to)

    @staticmethod
    def psi(
        path: Path = Ca.file_input,
        algorithm: str = Opt.val(
            r"""
            The algorithm for calculating similarity between annotation sets.

            Currently, only "j" (J') is supported. Refer to the docs for the equation.
            """,
            default="j",
        ),
        to: Path = Ca.output_matrix,
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        r"""
        Calculate a similarity matrix from annotations.

        The data are output as a dataframe (CSV by default), where rows and columns correspond
        to compounds, and the cell i,j is the overlap J' in annotations between compounds i and j.
        """
        set_up(log, quiet, verbose)
        # Default output is named after the algorithm and placed beside the input.
        default = path.parent / (algorithm + DEF_SUFFIX)
        to = MiscUtils.adjust_filename(to, default, replace)
        hits = HitFrame.read_file(path).to_hits()
        calculator = MatrixCalculation.create(algorithm)
        matrix = calculator.calc_all(hits)
        matrix.write_file(to)

    @staticmethod
    def calc_ecfp_psi(
        path: Path = CommonArgs.compounds,
        radius: int = Opt.val(r"""Radius of the ECFP fingerprint.""", default=4),
        n_bits: int = Opt.val(r"""Number of bits.""", default=2048),
        psi: bool = Opt.flag(
            r"""Use "psi" as the type in the resulting matrix instead of "phi"."""
        ),
        to: Path = Ca.output_matrix,
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        r"""
        Compute a similarity matrix from ECFP fingerprints.

        Requires rdkit to be installed.

        This is a bit faster than computing using a search and then calculating with ``:calc:psi``.
        Values range from 0 (no overlap) to 1 (identical).
        The type will be "phi" -- in contrast to using :calc:phi.
        See ``:calc:phi`` for more info.
        This is most useful for comparing a phenotypic phi against pure structural similarity.
        """
        set_up(log, quiet, verbose)
        # The same name serves as the default file stem and as the key of the matrix.
        name = f"ecfp{radius}-n{n_bits}"
        default = path.parent / (name + DEF_SUFFIX)
        to = MiscUtils.adjust_filename(to, default, replace)
        df = InputFrame.read_file(path)
        kind = "psi" if psi else "phi"
        short = MatrixPrep.ecfp_matrix(df, radius, n_bits)
        # The three False arguments occupy the positions that ``prep_phi`` fills
        # with (normalize, log10, invert).
        long_form = MatrixPrep(kind, False, False, False).create({name: short})
        long_form.write_file(to)

    @staticmethod
    def tau(
        phi: Path = Ca.input_matrix,
        psi: Path = Ca.input_matrix,
        algorithm: str = Opt.val(
            r"""
            The algorithm for calculating concordance.

            Currently, only "tau" is supported.
            This calculation is a modified Kendall’s τ-a, where disconcordant ignores ties.
            See the docs for more info.
            """,
            default="tau",
        ),
        seed: int = Ca.seed,
        samples: int = Ca.boot,
        to: Optional[Path] = Opt.out_file(
            rf"""
            The path to a table for output.

            {Ca.output_formats}

            [default: <input-path.parent>/<algorithm>-concordance.{DEF_SUFFIX}]
            """,
        ),
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        r"""
        Calculate correlation between matrices.

        Values are calculated over bootstrap, outputting a table.

        Phi is typically a phenotypic matrix, and psi a matrix from Mandos.
        This command is designed to calculate the similarity between compound annotations
        (from Mandos) and some user-input compound–compound similarity matrix.
        (For example, vectors from a high-content cell screen.
        See ``:calc:correlation`` or ``:calc:enrichment`` if you have a single variable,
        such as a hit or lead-like score.
        """
        set_up(log, quiet, verbose)
        # NOTE(review): the actual default is "<psi stem>-<algorithm><suffix>" beside
        # ``phi``, which differs from the "[default: ...]" text in the ``to`` help.
        default = phi.parent / f"{psi.stem}-{algorithm}{DEF_SUFFIX}"
        to = MiscUtils.adjust_filename(to, default, replace)
        phi_df = SimilarityDfLongForm.read_file(phi)
        psi_df = SimilarityDfLongForm.read_file(psi)
        calculator = ConcordanceCalculation.create(algorithm, phi_df, psi_df, samples, seed)
        concordance = calculator.calc_all(phi_df, psi_df)
        concordance.write_file(to)

    @staticmethod
    def calc_umap(
        psi_matrix: Path = Ca.input_matrix,
        algorithm: str = Opt.val(
            r"""
            Projection algorithm.

            Currently only "umap" is supported.
            """,
            default="umap",
        ),
        seed: str = Opt.val(
            r"""
            Random seed (integer or 'none').

            Setting to 'none' may increase performance.
            """,
            # Fixed: the parameter is a string ("<int>" or "none"), so the default
            # is now the string "0" rather than the int 0.
            default="0",
        ),
        params: str = Opt.val(
            rf"""
            Parameters fed to the algorithm.

            This is a comma-separated list of key=value pairs.
            For example: ``n_neighbors=4,n_components=12,min_dist=0.8``
            Supports all UMAP parameters except random_state and metric:

            {Ca.definition_list(_umap_params) if UMAP else "<list is unavailable>"}
            """,
            default="",
        ),
        to: Optional[Path] = Ca.project_to,
        replace: bool = Ca.replace,
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        r"""
        Calculate compound UMAP from psi matrices.

        The input should probably be calculated from ``:calc:matrix``.
        Saves a table of the UMAP coordinates.
        """
        # NOTE(review): only the availability check exists so far; nothing is
        # computed or written, and ``set_up`` is not called.
        if algorithm == "umap" and UMAP is None:
            raise ImportError("UMAP is not available")

    @staticmethod
    def prep_phi(
        matrices: List[Path] = Ca.input_matrix_short_form,
        kind: str = Ca.var_type,
        to: Path = Ca.output_matrix,
        replace: bool = Ca.replace,
        normalize: bool = Opt.flag(
            r"""Rescale values to between 0 and 1 by (v-min) / (max-min). (Performed after negation.)"""
        ),
        # Fixed: ``log10`` and ``invert`` are on/off switches, so they are now declared
        # with ``Opt.flag`` like ``normalize`` instead of the value-style ``Opt.val``.
        log10: bool = Opt.flag(r"""Rescales values by log10. (Performed after normalization.)"""),
        invert: bool = Opt.flag(r"""Multiplies the values by -1. (Performed first.)"""),
        log: Optional[Path] = CommonArgs.log_path,
        quiet: bool = CommonArgs.quiet,
        verbose: bool = CommonArgs.verbose,
    ) -> None:
        r"""
        Convert phi matrices to one long-form matrix.

        The keys will be derived from the filenames.
        """
        set_up(log, quiet, verbose)
        default = "."
        if to is None:
            if matrices:
                # Fixed: use the first matrix's directory. The old code took an
                # arbitrary element of a set of parents, so the result could vary
                # between runs when the inputs lived in different directories.
                default = matrices[0].parent
            else:
                logger.warning(f"Outputting to {default}")
        to = MiscUtils.adjust_filename(to, default, replace)
        long_form = MatrixPrep(kind, normalize, log10, invert).from_files(matrices)
        long_form.write_file(to)

    @staticmethod
    def plot_umap(
        umap_df: Path = Ca.project_input,
        style: Optional[Path] = Ca.style_for_compounds,
        color_col: Optional[str] = Ca.color_col,
        marker_col: Optional[str] = Ca.marker_col,
        to: Optional[Path] = Ca.plot_to,
    ) -> None:
        r"""
        Plot UMAP, etc. of compounds from psi matrices.

        Will plot one variable (psi) per column.
        """
        # NOTE(review): no implementation yet -- the body is the docstring only.

    @staticmethod
    def plot_score(
        path: Path = Ca.input_correlation,
        kind: str = Ca.plot_kind,
        style: Optional[Path] = Ca.style_for_pairs,
        color_col: Optional[str] = Ca.color_col,
        marker_col: Optional[str] = Ca.marker_col,
        ci: float = Ca.ci,
        to: Optional[Path] = Ca.plot_to,
    ) -> None:
        r"""
        Plot correlation to scores.

        Visualizes the correlation between predicate/object pairs and user-supplied scores.
        Will output one figure (file) per scoring function.
        Will plot (psi, score-fn) pairs over a grid,
        one row per scoring function and column per psi.
        """
        # NOTE(review): no implementation yet -- the body is the docstring only.

    @staticmethod
    def plot_pairing(
        path: Path = Ca.input_matrix,
        join: Optional[bool] = Opt.flag(
            r"""
            Pool all psi variables into a single column with multiple plots.
            """
        ),
        kind: str = Opt.val(
            r"""
            Either 'points', 'lines', or 'points+lines'.

            - points: Scatter plots of (phi, psi) values.

            - lines: Plot a linear interpolation.

            - ci: Plot a linear interpolation with a confidence band.

            - points+lines: Both 'points' and 'lines'.
            """,
            "--type",
        ),
        ci: float = Ca.ci,
        sort_by: str = Opt.val(
            r"""
            Which axis to sort by: 'phi'/'x' or 'psi'/'y'.

            Sorting by psi values (y-axis) makes it easier to compare psi variables,
            while sorting by phi values (x-axis) makes it easier to compare phi variables.
            """,
            default="psi",
        ),
        style: Optional[Path] = Ca.style_for_psi,
        color_col: Optional[str] = Ca.color_col,
        marker_col: Optional[str] = Ca.marker_col,
        to: Optional[Path] = Ca.plot_to,
    ) -> None:
        r"""
        Plot line plots of phi against psi.

        Plots scatter plots of (phi, psi) values, sorted by phi values.
        All plots are log/log (all similarity values should be scaled from 0 to 1).

        For each unique phi matrix and psi matrix, flattens the matrices and plots
        the flattened (n choose 2 - n) pairs of each jointly, phi mapped to the y-axis
        and psi mapped to the x-axis.

        Without --split:

        Will show values for all psi variables together.
        If ``--color`` is not set, will choose a palette.
        Works best with ``--type lines``.

        With --split:

        Will plot each (phi, psi) pair over a grid, one plot per cell:
        One row per phi and one column per psi.
        """
        # NOTE(review): no implementation yet. The help above mentions ``--split``
        # and ``--color``, but the declared parameters are ``join`` and ``color_col``.
        # The axis mapping in the docstring also disagrees with the ``sort_by`` help.
        # Reconcile these when implementing.

    @staticmethod
    def plot_pairing_violin(
        path: Path = Ca.input_matrix,
        split: bool = Opt.flag(
            r"""
            Split each violin into phi #1 on the left and phi #2 on the right.

            Useful to compare two phi variables. Requires exactly 2.
            """
        ),
        style: Optional[Path] = Ca.style_for_psi,
        color_col: Optional[str] = Ca.color_col,
        marker_col: Optional[str] = Ca.marker_col,
        to: Optional[Path] = Ca.plot_to,
    ) -> None:
        r"""
        Plot violin plots from tau values.

        The input data should be generated by ``:calc:phi-vs-psi.tau``.

        Will plot each (phi, psi) pair over a grid, one row per phi and one column per psi
        (unless ``--split`` is set).
        """
        # NOTE(review): no implementation yet -- the body is the docstring only.
762
|
|
|
|
763
|
|
|
|
764
|
|
|
# Public names of this module: only the command container class is exported.
__all__ = ["MiscCommands"]
765
|
|
|
|