discover()   F
last analyzed

Complexity

Conditions 13

Size

Total Lines 124
Code Lines 85

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 182

Importance

Changes 0
Metric Value
cc 13
eloc 85
c 0
b 0
f 0
dl 0
loc 124
ccs 0
cts 59
cp 0
crap 182
rs 3.4309

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

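Commonly applied refactorings include:

- Extract Method
- If many parameters or temporary variables are present: Replace Temp with Query, Introduce Parameter Object, Preserve Whole Object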

Complexity

Complex classes like com.strider.datadefender.discoverer.DatabaseDiscoverer.discover() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
/*
2
 *
3
 * Copyright 2014-2019, Armenak Grigoryan, and individual contributors as indicated
4
 * by the @authors tag. See the copyright.txt in the distribution for a
5
 * full listing of individual contributors.
6
 *
7
 * This is free software; you can redistribute it and/or modify it
8
 * under the terms of the GNU Lesser General Public License as
9
 * published by the Free Software Foundation; either version 2.1 of
10
 * the License, or (at your option) any later version.
11
 *
12
 * This software is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 */
18
package com.strider.datadefender.discoverer;
19
20
import com.strider.datadefender.DataDefenderException;
21
import com.strider.datadefender.ModelDiscoveryConfig;
22
import com.strider.datadefender.database.IDbFactory;
23
import com.strider.datadefender.database.metadata.IMetaData;
24
import com.strider.datadefender.database.metadata.TableMetaData;
25
import com.strider.datadefender.database.metadata.TableMetaData.ColumnMetaData;
26
import com.strider.datadefender.database.sqlbuilder.ISqlBuilder;
27
import com.strider.datadefender.functions.Utils;
28
import com.strider.datadefender.report.ReportUtil;
29
import com.strider.datadefender.specialcase.SpecialCase;
30
import com.strider.datadefender.utils.Score;
31
32
import java.io.File;
33
import java.io.IOException;
34
import java.io.InputStream;
35
import java.lang.reflect.InvocationTargetException;
36
import java.lang.reflect.Method;
37
import java.nio.charset.StandardCharsets;
38
import java.sql.Blob;
39
import java.sql.Clob;
40
import java.sql.Date;
41
import java.sql.ResultSet;
42
import java.sql.SQLException;
43
import java.sql.Statement;
44
import java.sql.Time;
45
import java.sql.Timestamp;
46
import java.text.DateFormat;
47
import java.text.ParseException;
48
import java.text.SimpleDateFormat;
49
import java.util.ArrayList;
50
import java.util.List;
51
import java.util.Locale;
52
import java.util.Objects;
53
import java.util.stream.Collectors;
54
55
import me.tongfei.progressbar.ProgressBar;
56
57
import org.apache.commons.collections4.CollectionUtils;
58
import org.apache.commons.collections4.ListUtils;
59
import org.apache.commons.io.IOUtils;
60
import org.apache.commons.lang3.ClassUtils;
61
import org.apache.commons.lang3.StringUtils;
62
63
import opennlp.tools.util.Span;
64
65
import lombok.extern.log4j.Log4j2;
66
67
/**
68
 *
69
 * @author Armenak Grigoryan
70
 */
71
@Log4j2
72
public class DatabaseDiscoverer extends Discoverer {
73
74
    protected final IDbFactory factory;
75
76
    /**
     * Creates a discoverer that works against the database exposed by the given factory.
     *
     * @param config  discovery configuration; passed unchanged to the parent constructor
     * @param factory database factory kept in {@link #factory} for later use
     * @throws IOException if the parent constructor reports an I/O failure
     */
    public DatabaseDiscoverer(final ModelDiscoveryConfig config, final IDbFactory factory)
            throws IOException {
        super(config);
        this.factory = factory;
    }
80
81
    /**
82
     * Calls a function defined as an extension
83
     * @param function
84
     * @param data
85
     * @param text
86
     * @return
87
     * @throws SQLException
88
     * @throws NoSuchMethodException
89
     * @throws SecurityException
90
     * @throws IllegalAccessException
91
     * @throws IllegalArgumentException
92
     * @throws InvocationTargetException
93
     */
94
    private Object callExtension(final String function, final ColumnMetaData data, final String text)
95
            throws SQLException, NoSuchMethodException, SecurityException, IllegalAccessException,
96
                   IllegalArgumentException, InvocationTargetException {
97
98
        if (StringUtils.isBlank(function)) {
99
            return null;
100
        }
101
102
        Object value = null;
103
104
        try {
105
            final String className = Utils.getClassName(function);
106
            final String methodName = Utils.getMethodName(function);
107
            final Method method = Class.forName(className).getDeclaredMethod(
108
                methodName,
109
                new Class[] { ColumnMetaData.class, String.class }
110
            );
111
            final SpecialCase instance = (SpecialCase) Class.forName(className).getConstructor().newInstance();
112
            value = method.invoke(instance, data, text);
113
114
        } catch (InstantiationException | ClassNotFoundException ex) {
115
            log.error(ex.toString());
116
            log.debug(ex.toString(), ex);
117
        }
118
119
        return value;
120
    }
121
122
    @SuppressWarnings("unchecked")
123
    public List<ColumnMetaData> discover()
124
        throws ParseException,
125
        DataDefenderException,
126
        IOException,
127
        SQLException {
128
129
        List<ColumnMatch> finalList = new ArrayList<>();
130
131
        try (ProgressBar pb = new ProgressBar(
132
            "Discovering by model...",
133
            CollectionUtils.size(config.getModels()) + CollectionUtils.size(config.getFileModels())
134
        )) {
135
            for (final String sm : CollectionUtils.emptyIfNull(config.getModels())) {
136
                log.info("********************************");
137
                log.info("Processing model " + sm);
138
                log.info("********************************");
139
                pb.setExtraMessage("Model: " + sm);
140
141
                final Model model = createModel(sm);
142
                matches = discoverAgainstSingleModel(model);
143
                finalList = ListUtils.union(finalList, matches);
144
                pb.step();
145
            }
146
            for (final File fm : CollectionUtils.emptyIfNull(config.getFileModels())) {
147
                log.info("********************************");
148
                log.info("Processing model " + fm);
149
                log.info("********************************");
150
                pb.setExtraMessage("Model: " + fm.getName());
151
152
                final Model model = createModel(fm);
153
                matches = discoverAgainstSingleModel(model);
154
                finalList = ListUtils.union(finalList, matches);
155
                pb.step();
156
            }
157
        }
158
159
        log.info("List of suspects:");
160
161
        final Score score           = new Score();
162
        int         highRiskColumns = 0;
163
        int         rowCount        = 0;
164
165
        for (final ColumnMatch match : finalList) {
166
167
            ColumnMetaData column = match.getColumn();
168
            // Row count
169
            if (config.getCalculateScore()) {
170
                log.debug("Counting number of rows ...");
171
                rowCount = ReportUtil.rowCount(factory, 
172
                               column.getTable().getTableName());
173
            } else {
174
                log.debug("Skipping counting number of rows ...");
175
            }
176
177
            // Getting 5 sample values
178
            final List<String> sampleDataList = ReportUtil.sampleData(factory, column);
179
            // Output
180
            log.info("Column                      : " + column.toString());
181
            log.info(StringUtils.repeat('=', column.toString().length() + 30));
182
            log.info("Model                       : " + match.getModel());
183
            log.info("Number of rows in the table : " + rowCount);
184
185
            if (config.getCalculateScore()) {
186
                log.info("Score                       : " + score.columnScore(rowCount));
187
            } else {
188
                log.info("Score                       : N/A");
189
            }
190
191
            log.info("Sample data");
192
            log.info(StringUtils.repeat('-', 11));
193
            
194
            sampleDataList.forEach((sampleData) -> {
195
                log.info(sampleData);
196
            });
197
198
            log.info("");
199
200
            // Score calculation is evaluated with score_calculation parameter
201
            if (config.getCalculateScore() && score.columnScore(rowCount).equals("High")) {
202
                highRiskColumns++;
203
            }
204
        }
205
206
        // Only applicable when parameter table_rowcount=yes otherwise score calculation should not be done
207
        if (config.getCalculateScore()) {
208
            log.info("Overall score: " + score.dataStoreScore());
209
            log.info("");
210
211
            if ((finalList != null) && (finalList.size() > 0)) {
212
                log.info("============================================");
213
214
                if (finalList.size() > config.getThresholdCount()) {
215
                    log.info(
216
                        "Number of PI [{}] columns is higher than defined threashold [{}]",
217
                        finalList.size(),
218
                        config.getThresholdCount()
219
                    );
220
                } else {
221
                    log.info(
222
                        "Number of PI [{}] columns is lower than or equal to defined threashold [{}]",
223
                        finalList.size(),
224
                        config.getThresholdCount()
225
                    );
226
                }
227
                if (highRiskColumns > config.getThresholdHighRisk()) {
228
                    log.info(
229
                        "Number of High risk PI [{}] columns is higher than defined threashold [{}]",
230
                        highRiskColumns,
231
                        config.getThresholdHighRisk()
232
                    );
233
                } else {
234
                    log.info(
235
                        "Number of High risk PI [{}] columns is lower than or equal to defined threashold [{}]",
236
                        highRiskColumns,
237
                        config.getThresholdHighRisk()
238
                    );
239
                }
240
            }
241
        } else {
242
            log.info("Overall score: N/A");
243
        }
244
245
        return matches.stream().map((c) -> c.getColumn()).collect(Collectors.toList());
246
    }
247
248
    private List<ColumnMatch> discoverAgainstSingleModel(final Model model)
249
        throws ParseException,
250
        DataDefenderException,
251
        IOException,
252
        SQLException {
253
254
        final IMetaData           metaData = factory.fetchMetaData();
255
        final List<TableMetaData> map      = metaData.getMetaData();
256
257
        // Start running NLP algorithms for each column and collect percentage
258
        matches = new ArrayList<>();
259
260
        ColumnMatch             specialCaseData;
261
        final List<ColumnMatch> specialCaseDataList  = new ArrayList();
262
        List<String>            specialCaseFunctions = config.getExtensions();
263
        boolean                 specialCase          = CollectionUtils.isNotEmpty(specialCaseFunctions);
264
265
        log.info("Extension list: {}", specialCaseFunctions);
266
267
        final ISqlBuilder sqlBuilder = factory.createSQLBuilder();
268
        List<Probability> probabilityList;
269
270
        for (final TableMetaData table : map) {
271
272
            final String tableName  = table.getTableName();
273
            final String prefixed = sqlBuilder.prefixSchema(tableName);
274
            final String cntQuery = "SELECT COUNT(*) FROM " + prefixed;
275
276
            int numRows = config.getLimit();
277
            try (
278
                Statement stmt = factory.getConnection().createStatement();
279
                ResultSet rs = stmt.executeQuery(cntQuery)
280
            ) {
281
                rs.next();
282
                numRows = Math.min(numRows, rs.getInt(1));
283
            } catch (SQLException e) {
284
            }
285
286
            List<ColumnMetaData> cols = table.getColumns().stream()
287
                .filter((c) -> !c.isForeignKey() && !c.isPrimaryKey())
288
                .collect(Collectors.toList());
289
290
            int numSteps = numRows * cols.size();
291
            try (ProgressBar pb = new ProgressBar(model.getName() + " in " + tableName, numSteps)) {
292
                for (final ColumnMetaData data : cols) {
293
                    
294
                    final String columnName = data.getColumnName();
295
                    pb.setExtraMessage(columnName);
296
297
                    log.debug("Column type: [" + data.getColumnType() + "]");
298
                    probabilityList = new ArrayList<>();
299
                    log.info("Analyzing column [" + tableName + "].[" + columnName + "]");
300
301
                    final String query = sqlBuilder.buildSelectWithLimit(
302
                        "SELECT " + columnName + " FROM "  + prefixed + " WHERE "
303
                            + columnName + " IS NOT NULL",
304
                        config.getLimit()
305
                    );
306
307
                    log.debug("Executing query against database: " + query);
308
                    try (
309
                        Statement stmt = factory.getConnection().createStatement();
310
                        ResultSet resultSet = stmt.executeQuery(query)
311
                    ) {
312
                        while (resultSet.next()) {
313
                            pb.step();
314
                            if (Objects.equals(Blob.class, data.getColumnType())) {
315
                                continue;
316
                            }
317
                            if (model.getName().equals("location") && ClassUtils.isAssignable(data.getColumnType(), Number.class)) {
318
                                continue;
319
                            }
320
321
                            String sentence = "";
322
                            if (Objects.equals(Clob.class, data.getColumnType())) {
323
                                Clob clob = resultSet.getClob(1);
324
                                InputStream is = clob.getAsciiStream();
325
                                sentence = IOUtils.toString(is, StandardCharsets.UTF_8.name());
326
                            } else {
327
                                sentence = resultSet.getString(1);
328
                            }
329
                            log.debug(sentence);
330
                            if (specialCaseFunctions != null && specialCase) {
331
                                try {
332
                                    for (String specialCaseFunction : specialCaseFunctions) {
333
                                        if ((sentence != null) && !sentence.isEmpty()) {
334
                                            log.debug("sentence: " + sentence);
335
                                            log.debug("data: " + data);
336
                                            specialCaseData = (ColumnMatch) callExtension(specialCaseFunction, data, sentence);
337
                                            if (specialCaseData != null) {
338
                                                if (!specialCaseDataList.contains(specialCaseData)) {
339
                                                    log.debug("Adding new special case data: " + specialCaseData.toString());
340
                                                    specialCaseDataList.add(specialCaseData);
341
                                                }
342
                                            } else {
343
                                                log.debug("No special case data found");
344
                                            }
345
                                        }
346
                                    }
347
                                } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) {
348
                                    log.error(e.toString());
349
                                }
350
                            }
351
352
                            if ((sentence != null) &&!sentence.isEmpty()) {
353
                                String processingValue;
354
355
                                if (Objects.equals(Date.class, data.getColumnType())
356
                                    || Objects.equals(Timestamp.class, data.getColumnType())
357
                                    || Objects.equals(Time.class, data.getColumnType())) {
358
359
                                    final DateFormat     originalFormat = new SimpleDateFormat(sentence, Locale.ENGLISH);
360
                                    final DateFormat     targetFormat   = new SimpleDateFormat("MMM d, yy", Locale.ENGLISH);
361
                                    final java.util.Date date           = originalFormat.parse(sentence);
362
363
                                    processingValue = targetFormat.format(date);
364
                                } else {
365
                                    processingValue = sentence;
366
                                }
367
368
                                // LOG.debug(sentence);
369
                                // Convert sentence into tokens
370
                                final String tokens[] = model.getTokenizer().tokenize(processingValue);
371
372
                                // Find names
373
                                final Span nameSpans[] = model.getNameFinder().find(tokens);
374
375
                                // find probabilities for names
376
                                final double[] spanProbs = model.getNameFinder().probs(nameSpans);
377
378
                                // Collect top X tokens with highest probability
379
                                // display names
380
                                for (int i = 0; i < nameSpans.length; i++) {
381
                                    final String span = nameSpans[i].toString();
382
383
                                    if (span.length() > 2) {
384
                                        log.debug("Span: " + span);
385
                                        log.debug("Covered text is: " + tokens[nameSpans[i].getStart()]);
386
                                        log.debug("Probability is: " + spanProbs[i]);
387
                                        probabilityList.add(new Probability(tokens[nameSpans[i].getStart()], spanProbs[i]));
388
                                    }
389
                                }
390
391
                                // From OpenNLP documentation:
392
                                // After every document clearAdaptiveData must be called to clear the adaptive data in the feature generators.
393
                                // Not calling clearAdaptiveData can lead to a sharp drop in the detection rate after a few documents.
394
                                model.getNameFinder().clearAdaptiveData();
395
                            }
396
                        }
397
                    } catch (SQLException sqle) {
398
                        log.error(sqle.toString());
399
                    }
400
401
                    final double averageProbability = calculateAverage(probabilityList);
402
403
                    if (averageProbability >= config.getProbabilityThreshold()) {
404
                        matches.add(new ColumnMatch(
405
                            data,
406
                            averageProbability,
407
                            model.getName(),
408
                            probabilityList)
409
                        );
410
                    }
411
                }
412
                pb.stepTo(numSteps);
413
            }
414
        }
415
416
        // Special processing
417
        if (!specialCaseDataList.isEmpty()) {
418
            log.debug("Special case data is processed :" + specialCaseDataList.toString());
419
420
            specialCaseDataList.forEach((specialData) -> {
421
                matches.add(specialData);
422
            });
423
        }
424
425
        return matches;
426
    }
427
}
428