com.strider.datadefender.discoverer.Discoverer   A
last analyzed

Complexity

Total Complexity 8

Size/Duplication

Total Lines 75
Duplicated Lines 0 %

Test Coverage

Coverage 0%

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 39
c 1
b 0
f 0
dl 0
loc 75
ccs 0
cts 21
cp 0
rs 10
wmc 8

5 Methods

Rating   Name   Duplication   Size   Complexity  
A Discoverer(ModelDiscoveryConfig) 0 7 2
A createModel(File) 0 2 1
A calculateAverage(List) 0 12 3
A createModelFrom(TokenNameFinderModel,String) 0 3 1
A createModel(String) 0 3 1
1
/*
2
 *
3
 * Copyright 2014, Armenak Grigoryan, and individual contributors as indicated
4
 * by the @authors tag. See the copyright.txt in the distribution for a
5
 * full listing of individual contributors.
6
 *
7
 * This is free software; you can redistribute it and/or modify it
8
 * under the terms of the GNU Lesser General Public License as
9
 * published by the Free Software Foundation; either version 2.1 of
10
 * the License, or (at your option) any later version.
11
 *
12
 * This software is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
 * Lesser General Public License for more details.
16
 */
17
package com.strider.datadefender.discoverer;
18
19
import com.strider.datadefender.DataDefenderException;
20
import com.strider.datadefender.ModelDiscoveryConfig;
21
import com.strider.datadefender.database.metadata.TableMetaData.ColumnMetaData;
22
import com.strider.datadefender.requirement.file.Generator;
23
24
import java.io.File;
25
import java.io.IOException;
26
import java.io.InputStream;
27
import java.util.List;
28
import java.util.Map;
29
import java.util.stream.Collectors;
30
31
import javax.xml.bind.JAXBException;
32
33
import opennlp.tools.namefind.NameFinderME;
34
import opennlp.tools.namefind.TokenNameFinderModel;
35
import opennlp.tools.tokenize.TokenizerME;
36
import opennlp.tools.tokenize.TokenizerModel;
37
38
import lombok.Data;
39
import lombok.extern.log4j.Log4j2;
40
41
/**
42
 * Holds common logic for Discoverers.
43
 * @author Akira Matsuo
44
 */
45
@Log4j2
46
public abstract class Discoverer {
47
48
    public static final String DEFAULT_TOKEN_MODEL = "en-token.bin";
49
    public static final Map<String, String> BUILT_IN_MODELS = Map.of(
50
        "date", "en-ner-date.bin",
51
        "location", "en-ner-location.bin",
52
        "money", "en-ner-money.bin",
53
        "organization", "en-ner-organization.bin",
54
        "person", "en-ner-person.bin",
55
        "time", "en-ner-time.bin"
56
    );
57
    
58
    @Data
59
    public static class ColumnMatch {
60
        final private ColumnMetaData column;
61
        final private double averageProbability;
62
        final private String model;
63
        final private List<Probability> probabilityList;
64
    }
65
66
    protected List<ColumnMatch> matches;
67
68
    protected final ModelDiscoveryConfig config;
69
    protected final TokenizerME tokenizer;
70
71
    public Discoverer(ModelDiscoveryConfig config) throws IOException {
72
        this.config = config;
73
        if (config.getTokenModel() != null) {
74
            tokenizer = new TokenizerME(new TokenizerModel(config.getTokenModel()));
75
        } else {
76
            try (InputStream stream = Discoverer.class.getResourceAsStream(DEFAULT_TOKEN_MODEL)) {
77
                tokenizer = new TokenizerME(new TokenizerModel(stream));
78
            }
79
        }
80
    }
81
82
    public double calculateAverage(final List<Probability> values) {
83
        Double sum = 0.0;
84
85
        if (!values.isEmpty()) {
86
            for (final Probability value : values) {
87
                sum += value.getProbabilityValue();
88
            }
89
90
            return sum / values.size();
91
        }
92
93
        return sum;
94
    }
95
96
    private Model createModelFrom(TokenNameFinderModel tnf, String modelName) {
97
        NameFinderME nameFinder = new NameFinderME(tnf);
98
        return new Model(tokenizer, nameFinder, modelName);
99
    }
100
101
    /**
102
     * Creates model POJO based on OpenNLP model file
103
     *
104
     * @param modelName
105
     * @return Model
106
     */
107
    public Model createModel(final File modelFile) throws IOException {
108
        return createModelFrom(new TokenNameFinderModel(modelFile), modelFile.getName());
109
    }
110
111
    /**
112
     * Creates model POJO based on a built-in OpenNLP model
113
     *
114
     * @param modelName
115
     * @return Model
116
     */
117
    public Model createModel(final String modelName) throws IOException {
118
        try (InputStream stream = Discoverer.class.getResourceAsStream(BUILT_IN_MODELS.get(modelName))) {
119
            return createModelFrom(new TokenNameFinderModel(stream), modelName);
120
        }
121
    }
122
}