1
|
|
|
#!/usr/bin/env python |
2
|
|
|
|
3
|
|
|
import click |
4
|
|
|
from topic_modeling_toolkit.reporting import TopicsHandler |
5
|
|
|
|
6
|
|
|
|
7
|
|
|
@click.command()
@click.option('--dataset', '-d', required=True,
              prompt="Which is the dataset the model was trained on? (input dataset string label)",
              help="The dataset that was used to train the model on.")
@click.option('--model-label', '-m-l', required=True,
              prompt="Which model do you want to query for its topics? (input model label)",
              help="The model label to use searching for stored experimental results.")
@click.option('--topics-set', '-t-s', default='domain', show_default=True,
              type=click.Choice(["background", "domain"]),
              help="Common lexis should be collected in 'background' topics. 'Domain' topics should be free of common lexis.")
@click.option('--tokens-type', '-t-t', default='top-tokens', show_default=True,
              help="'top-tokens' is a list sorted on p(w|t). 'kernel' is a list sorted on p(t|w); should be accompanied by "
                   "threshold, ie 'kernel60' -> 0.60, 'kernel25' -> 0.25")
@click.option('--sort', '-s', default='name', show_default=True,
              help="Reports back the list of topics sorted on the metric. 'name': alphabetically by name, 'coh': by kernel "
                   "coherence, 'con': by kernel contrast, 'pur': by kernel purity. The last 3 options require a threshold similar to the "
                   "'tokens-type' arguments. Example syntaxes are: 'coh-80', 'con-25', 'pur-90'.")
@click.option('--columns', '-c', default=10, show_default=True,
              help="The number of columns (each corresponding to a topic's tokens group) to include per row")
@click.option('--number-of-tokens', '-nb-tokens', default=15, show_default=True,
              help="The maximum number of tokens to show per topic. If requested background tokens to report then this "
                   "argument correspond to the total amount of bg tokens to show.")
@click.option('--show_metrics/--no-show_metrics', show_default=True, default=True,
              help="Whether to print kernel coherence, contrast and purity for each individual topic. It requires a kernel "
                   "definition (threshold) to be inputted from '--tokens-type' or '--sort', else it has no effect.")
@click.option('--show_title/--no-show_title', show_default=True, default=False,
              help="Whether to print a title on top of the table of topics ")
def main(dataset, model_label, topics_set, tokens_type, sort, columns, number_of_tokens, show_metrics, show_title):
    """Print the topics of a trained model as a formatted table."""
    # NOTE: click uses the docstring above as the command description in
    # `--help`, so it is deliberately kept to a single user-facing sentence.
    # Per-parameter documentation lives in the option `help=` strings.

    # Imported at function scope because `os` is not among the imports visible
    # at the top of this script; without it the lookup below raises NameError.
    import os

    # The directory holding the collections/datasets is read from the
    # environment; an unset or empty value is treated as a configuration error.
    collections_dir = os.getenv('COLLECTIONS_DIR')
    if not collections_dir:
        raise RuntimeError("Please set the COLLECTIONS_DIR environment variable with the path to a directory containing collections/datasets")

    topic_handler = TopicsHandler(collections_dir)
    # Both formatting calls receive the model as a [dataset, model label] pair.
    model_spec = [dataset, model_label]

    if topics_set == 'background':
        # Background report: only the layout arguments are forwarded;
        # tokens_type, sort and show_metrics are not used on this path.
        b = topic_handler.pformat_background(model_spec,
                                             columns=columns,
                                             nb_tokens=number_of_tokens,
                                             show_title=show_title)
    else:
        # Domain report: positional order is (topics set, tokens type, sort,
        # number of tokens, columns) exactly as in the original call.
        b = topic_handler.pformat(model_spec,
                                  topics_set,
                                  tokens_type,
                                  sort,
                                  number_of_tokens,
                                  columns,
                                  topic_info=show_metrics,
                                  show_title=show_title)
    print(b)
51
|
|
|
|
52
|
|
|
# Script entry point: when the file is executed directly (not imported),
# invoke `main` with no arguments.
if __name__ == "__main__":
    main()
54
|
|
|
|