discover()   Grade: F
last analyzed

Complexity

Conditions 16

Size

Total Lines 131
Code Lines 88

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 272

Importance

Changes 1
Bugs 0
Features 0

Metric   Value
cc       16
eloc     88
c        1
b        0
f        0
dl       0
loc      131
ccs      0
cts      74
cp       0
crap     272
rs       2.0727
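
The CRAP value in the table is consistent with the standard CRAP formula, which combines cyclomatic complexity (cc) with test coverage (cp, as a fraction); with zero coverage the penalty term is maximal:

crap = cc^2 * (1 - cp)^3 + cc
     = 16^2 * (1 - 0)^3 + 16
     = 256 + 16
     = 272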

How to fix

Long Method

Small methods make your code easier to understand, especially when combined with a good name. Moreover, if a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented block should be extracted into a new method; the comment itself then makes a good starting point for naming it.

Commonly applied refactorings include Extract Method, Replace Temp with Query, and Replace Method with Method Object; a sketch of the first follows.
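
As a minimal before/after sketch of Extract Method (the ReportBuilder class and its names are hypothetical, not taken from this codebase), a commented step buried in a long method becomes a small, well-named helper:

import java.util.ArrayList;
import java.util.List;

class ReportBuilder {

    // Before: a comment marks a distinct step buried in a long method.
    String buildBefore(List<String> lines) {
        // strip blank lines
        List<String> kept = new ArrayList<>();
        for (String line : lines) {
            if (!line.trim().isEmpty()) {
                kept.add(line);
            }
        }
        return String.join("\n", kept);
    }

    // After: the commented step is extracted; the comment became the name.
    String buildAfter(List<String> lines) {
        return String.join("\n", withoutBlankLines(lines));
    }

    private List<String> withoutBlankLines(List<String> lines) {
        List<String> kept = new ArrayList<>();
        for (String line : lines) {
            if (!line.trim().isEmpty()) {
                kept.add(line);
            }
        }
        return kept;
    }
}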

Complexity

Complex methods like com.strider.datadefender.discoverer.FileDiscoverer.discover() often do a lot of different things. To break such a method down, we need to identify cohesive components within its class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.

Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often the faster option; a sketch of Extract Class follows.
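
As a minimal sketch (the ContentExtractor name is invented for illustration; the Tika types are the ones this file already uses), fields sharing a "parser"-style prefix hint at a cohesive component worth extracting into its own class:

import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;

// Before: handler, parser, and metadata fields live alongside unrelated
// discoverer state. After: the cohesive trio moves into one small class.
class ContentExtractor {

    // One extraction per instance: the Tika handler, parser, and
    // metadata always travel together.
    private final BodyContentHandler handler = new BodyContentHandler(-1);
    private final AutoDetectParser parser = new AutoDetectParser();
    private final Metadata metadata = new Metadata();

    String extractText(InputStream stream) throws Exception {
        parser.parse(stream, handler, metadata);
        return handler.toString();
    }
}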

/*
 *
 * Copyright 2014-2015, Armenak Grigoryan, and individual contributors as indicated
 * by the @authors tag. See the copyright.txt in the distribution for a
 * full listing of individual contributors.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 */
package com.strider.datadefender.discoverer;

import com.strider.datadefender.DataDefenderException;
import com.strider.datadefender.ModelDiscoveryConfig;
import com.strider.datadefender.file.metadata.FileMatchMetaData;
import com.strider.datadefender.functions.Utils;
import com.strider.datadefender.specialcase.SpecialCase;

import java.text.DecimalFormat;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.sql.SQLException;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.collections4.ListUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;

import org.xml.sax.SAXException;

import opennlp.tools.util.Span;

import lombok.extern.log4j.Log4j2;

/**
 *
 * @author Armenak Grigoryan
 */
@Log4j2
public class FileDiscoverer extends Discoverer {

    protected final List<File> directories;
    protected List<FileMatchMetaData> fileMatches;
    protected List<String> excludeExtensions;

    public FileDiscoverer(ModelDiscoveryConfig config, List<File> directories, List<String> excludeExtensions) throws IOException {
        super(config);
        this.directories = directories;
        this.excludeExtensions = excludeExtensions;
    }

    public List<FileMatchMetaData> discover()
        throws FileDiscoveryException,
        DataDefenderException,
        IOException,
        SAXException,
        TikaException {

        List<FileMatchMetaData> finalList = new ArrayList<>();

        for (final String sm : CollectionUtils.emptyIfNull(config.getModels())) {
            log.info("********************************");
            log.info("Processing model " + sm);
            log.info("********************************");

            final Model model = createModel(sm);
            fileMatches = discoverAgainstSingleModel(model);
            finalList = ListUtils.union(finalList, fileMatches);
        }
        for (final File fm : CollectionUtils.emptyIfNull(config.getFileModels())) {
            log.info("********************************");
            log.info("Processing model " + fm);
            log.info("********************************");

            final Model model = createModel(fm);
            fileMatches = discoverAgainstSingleModel(model);
            finalList = ListUtils.union(finalList, fileMatches);
        }

        // Special case
        List<String> specialCaseFunctions = config.getExtensions();
        boolean specialCase = CollectionUtils.isNotEmpty(specialCaseFunctions);

        if (specialCase) {
            Metadata metadata;
            try {
                log.info("**************" + specialCaseFunctions.toString());
                for (String fn : CollectionUtils.emptyIfNull(specialCaseFunctions)) {
                    for (final File node : directories) {
                        final List<File> files = (List<File>) FileUtils.listFiles(node, null, true);

                        for (final File fich : files) {
                            final String file         = fich.getName();
                            final String recursivedir = fich.getParent();

                            log.info("Analyzing [" + fich.getCanonicalPath() + "]");
                            final String ext = FilenameUtils.getExtension(fich.getName()).toLowerCase(Locale.ENGLISH);
                            log.debug("Extension: " + ext);

                            if (CollectionUtils.emptyIfNull(excludeExtensions).contains(ext)) {
                                log.info("Ignoring type " + ext);
                                continue;
                            }

                            final BodyContentHandler handler = new BodyContentHandler(-1);
                            final AutoDetectParser   parser  = new AutoDetectParser();

                            metadata = new Metadata();

                            String handlerString = "";
                            try (final InputStream stream = new FileInputStream(fich.getCanonicalPath())) {

                                log.debug("Loading data into the stream");
                                if (stream != null) {
                                    parser.parse(stream, handler, metadata);
                                    handlerString = handler.toString().replaceAll("( )+", " ").replaceAll("[\\t\\n\\r]+", " ");

                                    String[] tokens = handlerString.split(" ");

                                    for (int t = 0; t < tokens.length; t++) {
                                        String token = tokens[t];
                                        if (token.trim().length() < 1) {
                                            continue;
                                        }
                                        log.info(fn);
                                        FileMatchMetaData returnData = null;
                                        try {
                                            returnData =
                                                (FileMatchMetaData) callExtension(new FileMatchMetaData(recursivedir, file), fn, token);
                                        } catch (InvocationTargetException e) {
                                            continue;
                                        }
                                        if (returnData != null) {
                                            returnData.setModel("sin");
                                            returnData.setAverageProbability(1.0);
                                            List<FileMatchMetaData> specialFileMatches = new ArrayList<>();
                                            specialFileMatches.add(returnData);

                                            finalList = ListUtils.union(finalList, specialFileMatches);
                                        }
                                        log.debug(tokens[t]);
                                    }

                                }
                            } catch (IOException e) {
                                log.info("Unable to read " + fich.getCanonicalPath() + ". Ignoring...");
                            }
                            log.info("Finish processing " + fich.getCanonicalPath());
                        }
                    }
                }
            } catch (IOException | IllegalAccessException | IllegalArgumentException | NoSuchMethodException |
                    SecurityException | SQLException | TikaException | SAXException e) {
                log.error(e.toString());
            }
        }

        final DecimalFormat decimalFormat = new DecimalFormat("#.##");

        log.info("List of suspects:");
        log.info(String.format("%40s %20s %20s %20s", "Directory*", "File*", "Probability*", "Model*"));

        finalList = uniqueList(finalList);

        Collections.sort(finalList, Comparator.comparing(FileMatchMetaData::getFileName));

        for (final FileMatchMetaData data : finalList) {
            String result = "";
            final String probability = decimalFormat.format(data.getAverageProbability());
            result = String.format("%40s %20s %20s %20s",
                                                     data.getDirectory(),
                                                     data.getFileName(),
                                                     probability,
                                                     data.getModel());
            log.info(result);
        }

        return Collections.unmodifiableList(fileMatches);
    }

    private List<FileMatchMetaData> discoverAgainstSingleModel(final Model model)
        throws DataDefenderException,
        IOException,
        SAXException,
        TikaException {

        // Start running NLP algorithms for each column and collect percentage
        fileMatches = new ArrayList<>();

        log.info("Directories to analyze: {}", StringUtils.join(directories, ","));

        // Let's iterate over directories
        Metadata metadata;
        for (final File node : directories) {

            final List<File> files = (List<File>) FileUtils.listFiles(node, null, true);

            for (final File fich : files) {
                final String file         = fich.getName();
                final String recursivedir = fich.getParent();

                log.info("Analyzing [" + fich.getCanonicalPath() + "]");
                final String ext = FilenameUtils.getExtension(fich.getName()).toLowerCase(Locale.ENGLISH);
                log.debug("Extension: " + ext);

                if (CollectionUtils.emptyIfNull(excludeExtensions).contains(ext)) {
                    log.info("Ignoring type " + ext);
                    continue;
                }

                final BodyContentHandler handler = new BodyContentHandler(-1);
                final AutoDetectParser   parser  = new AutoDetectParser();

                metadata = new Metadata();

                String handlerString = "";
                try {
                    final InputStream stream = new FileInputStream(fich.getCanonicalPath());
                    log.debug("Loading data into the stream");
                    if (stream != null) {
                        parser.parse(stream, handler, metadata);
                        handlerString = handler.toString();
                        log.debug(handlerString);
                    }
                } catch (IOException e) {
                    log.info("Unable to read " + fich.getCanonicalPath() + ". Ignoring...");
                }

                fileMatches = getMatchedFiles(model, handler.toString(), file, recursivedir);
            }
        }

        return fileMatches;
    }

    protected List<FileMatchMetaData> getMatchedFiles(final Model model, String handler, String file, String recursivedir) {

        final String   tokens[]    = model.getTokenizer().tokenize(handler);
        final Span     nameSpans[] = model.getNameFinder().find(tokens);
        final double[] spanProbs   = model.getNameFinder().probs(nameSpans);
        List<Probability> probabilityList = new ArrayList<>();

        for (int i = 0; i < nameSpans.length; i++) {
            log.info("Span: " + nameSpans[i].toString());
            log.info("Covered text is: " + tokens[nameSpans[i].getStart()]);
            log.info("Probability is: " + spanProbs[i]);
            probabilityList.add(new Probability(tokens[nameSpans[i].getStart()], spanProbs[i]));
        }

        model.getNameFinder().clearAdaptiveData();

        final double averageProbability = calculateAverage(probabilityList);

        if (averageProbability >= config.getProbabilityThreshold()) {
            final FileMatchMetaData result = new FileMatchMetaData(recursivedir, file);

            result.setAverageProbability(averageProbability);
            result.setModel(model.getName());
            fileMatches.add(result);
        }

        return fileMatches;
    }

    private Object callExtension(final FileMatchMetaData metadata, final String function, final String text)
            throws SQLException, NoSuchMethodException, SecurityException, IllegalAccessException,
                   IllegalArgumentException, InvocationTargetException
    {
        if (StringUtils.isBlank(function)) {
            return null;
        }

        Object value = null;

        try {
            final String className = Utils.getClassName(function);
            final String methodName = Utils.getMethodName(function);
            final Method method = Class.forName(className).getMethod(
                methodName,
                new Class[] { FileMatchMetaData.class, String.class }
            );
            final SpecialCase instance = (SpecialCase) Class.forName(className).getConstructor().newInstance();
            final Map<String, Object> paramValues = new HashMap<>(2);

            paramValues.put("metadata", metadata);
            paramValues.put("text", text);
            log.info("before");
            log.info(text);
            value = method.invoke(instance, metadata, text);
            log.info("after");
        } catch (InstantiationException | ClassNotFoundException ex) {
            log.error(ex.toString());
        }

        return value;
    }

    private static List<FileMatchMetaData> uniqueList(List<FileMatchMetaData> finalList) {

        HashSet<FileMatchMetaData> hs = new HashSet<>();
        hs.addAll(finalList);
        finalList.clear();
        finalList.addAll(hs);

        return finalList;
    }

}
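
Applying the Extract Method advice to discover() itself, one possible decomposition is sketched below. The helper names are suggestions and do not exist in the codebase; each would absorb one commented region of the original method (the two model loops, the "Special case" block, and the "List of suspects" report). Note that returning the accumulated finalList, rather than the fileMatches field, would also preserve the special-case matches that the current return statement silently drops.

// Sketch only: discover() reduced to orchestration over named helpers.
public List<FileMatchMetaData> discover()
        throws FileDiscoveryException, DataDefenderException,
               IOException, SAXException, TikaException {

    List<FileMatchMetaData> finalList = new ArrayList<>();

    finalList = ListUtils.union(finalList, discoverWithNamedModels());  // loop over config.getModels()
    finalList = ListUtils.union(finalList, discoverWithFileModels());   // loop over config.getFileModels()
    finalList = ListUtils.union(finalList, discoverSpecialCases());     // the "Special case" block

    finalList = uniqueList(finalList);
    finalList.sort(Comparator.comparing(FileMatchMetaData::getFileName));
    logSuspects(finalList);                                             // the "List of suspects" report

    return Collections.unmodifiableList(finalList);
}

Each extracted helper would have a cyclomatic complexity well below the current 16, and the per-file Tika parsing that is duplicated between discover() and discoverAgainstSingleModel() could then be shared by both paths.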