| Metric | Value |
| --- | --- |
| Conditions | 13 |
| Total Lines | 83 |
| Code Lines | 71 |
| Comment Lines | 0 |
| Comment Ratio | 0 % |
| Changes | 0 |
Small methods make your code easier to understand, especially when combined with a good name. Moreover, when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments inside a method's body, that is usually a good sign that the commented part should be extracted into a new method, with the comment as a starting point for the new method's name.
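As a minimal sketch of this comment-to-method-name move (the function names here are invented for illustration):

```python
# Before: a comment labels a block inside a longer method.
def report(path: str) -> None:
    # load and clean the rows
    with open(path) as f:
        rows = [line.strip().split(",") for line in f]
    rows = [r for r in rows if r and r[0]]
    print(f"{len(rows)} rows")

# After: the comment became the name of an extracted method.
def load_clean_rows(path: str) -> list[list[str]]:
    with open(path) as f:
        rows = [line.strip().split(",") for line in f]
    return [r for r in rows if r and r[0]]

def report(path: str) -> None:
    print(f"{len(load_clean_rows(path))} rows")
```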
Commonly applied refactorings include Extract Method. If many parameters or temporary variables are present, Replace Temp with Query, Introduce Parameter Object, or Replace Method with Method Object are also worth considering.
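Replace Method with Method Object is often the best fit when a long method's temporaries would otherwise have to be threaded through every extracted helper. A minimal, self-contained sketch, with all names invented:

```python
class FilterRun:
    """Hypothetical method object: each temporary of a long method becomes a
    field, so extracted helpers can share state without long parameter lists."""

    def __init__(self, values: list[float], cutoff: float):
        self.values = values
        self.cutoff = cutoff
        self.kept: list[float] = []  # was a local temporary
        self.n_skipped: int = 0      # was a local temporary

    def run(self) -> list[float]:
        self._filter()
        self._log()
        return self.kept

    def _filter(self) -> None:
        for v in self.values:
            if v >= self.cutoff:
                self.kept.append(v)
            else:
                self.n_skipped += 1

    def _log(self) -> None:
        print(f"kept {len(self.kept):,}, skipped {self.n_skipped:,}")

# Usage: one call replaces the former long function.
result = FilterRun([0.1, 0.7, 0.9], cutoff=0.5).run()
```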
Complex methods like mandos.analysis.distances.JPrimeMatrixCalculator.calc_all() often do a lot of different things. To break such a method down, we need to identify a cohesive component within its class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
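In calc_all() below, the fields min_compounds, min_hits, and min_nonzero share a prefix and are read only by the skip-or-keep checks, so they are a plausible seed for Extract Class. A hedged sketch; KeyFilter and reject_reason are invented names, not part of mandos:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass(frozen=True)
class KeyFilter:
    """Hypothetical extracted class owning the three min_* thresholds."""

    min_compounds: int
    min_hits: int
    min_nonzero: int

    def reject_reason(
        self, n_compounds: int, n_hits: int, n_nonzero: Optional[int]
    ) -> Optional[str]:
        # Mirrors the if/elif chain in calc_all(); returns None when the key passes.
        if n_compounds < self.min_compounds:
            return f"{n_compounds:,} < {self.min_compounds:,} compounds"
        if n_hits < self.min_hits:
            return f"{n_hits:,} < {self.min_hits:,} hits"
        if n_nonzero is not None and n_nonzero < self.min_nonzero:
            return f"{n_nonzero:,} < {self.min_nonzero:,} nonzero pairs"
        return None
```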
| 1 | """ |
||
| 78 | def calc_all(self, path: Path, to: Path, *, keep_temp: bool = False) -> SimilarityDfLongForm: |
||
| 79 | hits = HitDf.read_file(path).to_hits() |
||
| 80 | key_to_hit = Au.hit_multidict(hits, "search_key") |
||
| 81 | logger.notice(f"Calculating J on {len(key_to_hit):,} keys from {len(hits):,} hits") |
||
| 82 | deltas, files, good_keys = [], [], {} |
||
| 83 | for key, key_hits in key_to_hit.items(): |
||
| 84 | key: str = key |
||
| 85 | key_hits: Sequence[AbstractHit] = key_hits |
||
| 86 | n_compounds_0 = len({k.origin_inchikey for k in key_hits}) |
||
| 87 | part_path = self._path_of(path, key) |
||
| 88 | n_compounds_in_mx = None |
||
| 89 | n_nonzero = None |
||
| 90 | df = None |
||
| 91 | if part_path.exists(): |
||
| 92 | try: |
||
| 93 | df = SimilarityDfLongForm.read_file( |
||
| 94 | part_path, file_hash=False |
||
| 95 | ) # TODO: file_hash=True |
||
| 96 | logger.warning(f"Results for key {key} already exist ({len(df):,} rows)") |
||
| 97 | n_compounds_in_mx = len(df["inchikey_1"].unique()) |
||
| 98 | except HashFileMissingError: |
||
| 99 | logger.error(f"Extant results for key {key} appear incomplete; restarting") |
||
| 100 | logger.opt(exception=True).debug(f"Hash error for {key}") |
||
| 101 | unlink(part_path) |
||
| 102 | # now let it go into the next block -- calculate from scratch |
||
| 103 | if n_compounds_0 >= self.min_compounds: |
||
| 104 | t1 = time.monotonic() |
||
| 105 | df: SimilarityDfShortForm = self.calc_one(key, key_hits) |
||
| 106 | t2 = time.monotonic() |
||
| 107 | deltas.append(t2 - t1) |
||
| 108 | df = df.to_long_form(kind="psi", key=key) |
||
| 109 | n_compounds_in_mx = len(df["inchikey_1"].unique()) |
||
| 110 | df.write_file(part_path) |
||
| 111 | logger.debug(f"Wrote results for {key} to {part_path}") |
||
| 112 | if df is not None: |
||
| 113 | n_nonzero = len(df[df["value"] > 0]) |
||
| 114 | if n_compounds_in_mx < self.min_compounds: |
||
| 115 | logger.warning( |
||
| 116 | f"Key {key} has {n_compounds_in_mx:,} < {self.min_compounds:,} compounds; skipping" |
||
| 117 | ) |
||
| 118 | elif len(key_hits) < self.min_hits: |
||
| 119 | logger.warning( |
||
| 120 | f"Key {key} has {len(key_hits):,} < {self.min_hits:,} hits; skipping" |
||
| 121 | ) |
||
| 122 | elif n_nonzero is not None and n_nonzero < self.min_nonzero: |
||
| 123 | logger.warning( |
||
| 124 | f"Key {key} has {n_nonzero:,} < {self.min_nonzero:,} nonzero pairs; skipping" |
||
| 125 | ) # TODO: percent nonzero? |
||
| 126 | else: |
||
| 127 | files.append(part_path) |
||
| 128 | good_keys[key] = n_compounds_in_mx |
||
| 129 | del df |
||
| 130 | logger.debug(f"Concatenating {len(files):,} files") |
||
| 131 | df = SimilarityDfLongForm( |
||
| 132 | pd.concat( |
||
| 133 | [SimilarityDfLongForm.read_file(self._path_of(path, k)) for k in good_keys.keys()] |
||
| 134 | ) |
||
| 135 | ) |
||
| 136 | logger.notice(f"Included {len(good_keys):,} keys: {', '.join(good_keys.keys())}") |
||
| 137 | quartiles = {} |
||
| 138 | for k, v in good_keys.items(): |
||
| 139 | vals = df[df["key"] == k]["value"] |
||
| 140 | qs = {x: vals.quantile(x) for x in [0, 0.25, 0.5, 0.75, 1]} |
||
| 141 | quartiles[k] = list(qs.values()) |
||
| 142 | logger.info(f"Key {k} has {v:,} compounds and {len(key_to_hit[k]):,} hits") |
||
| 143 | logger.info( |
||
| 144 | f" {k} {Chars.fatright} unique values = {len(vals.unique())} unique values" |
||
| 145 | ) |
||
| 146 | logger.info(f" {k} {Chars.fatright} quartiles: " + " | ".join(qs.values())) |
||
| 147 | df = df.set_attrs( |
||
| 148 | dict( |
||
| 149 | keys={ |
||
| 150 | k: dict(compounds=v, hits=len(key_to_hit[k]), quartiles=quartiles[k]) |
||
| 151 | for k, v in good_keys.items() |
||
| 152 | } |
||
| 153 | ) |
||
| 154 | ) |
||
| 155 | df.write_file(to, attrs=True, file_hash=True) |
||
| 156 | logger.notice(f"Wrote {len(df):,} rows to {to}") |
||
| 157 | if not keep_temp: |
||
| 158 | for k in key_to_hit.keys(): |
||
| 159 | unlink(self._path_of(path, k)) |
||
| 160 | return df |
||
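Read as a whole, calc_all() interleaves at least five concerns: loading hits, the per-key compute-or-reuse loop, concatenation, annotation with quartile statistics, and writing plus cleanup. One possible Extract Method decomposition is sketched below; every helper name is invented, and each stub body would receive the corresponding block of the listing above verbatim:

```python
class JPrimeMatrixCalculatorSketch:
    """Hypothetical decomposition of calc_all(); not mandos code."""

    def calc_all(self, path, to, *, keep_temp=False):
        key_to_hit = self._load_hits(path)                # read_file + hit_multidict
        good_keys = self._process_keys(path, key_to_hit)  # the per-key loop
        df = self._concatenate(path, good_keys)           # pd.concat over part files
        df = self._annotate(df, good_keys, key_to_hit)    # quartiles + set_attrs
        self._write_and_clean(df, to, path, key_to_hit, keep_temp)
        return df

    # Each stub stands in for one block of the original method.
    def _load_hits(self, path): ...
    def _process_keys(self, path, key_to_hit): ...
    def _concatenate(self, path, good_keys): ...
    def _annotate(self, df, good_keys, key_to_hit): ...
    def _write_and_clean(self, df, to, path, key_to_hit, keep_temp): ...
```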