package org.ansj.app.phrase;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.ansj.domain.Term;
import org.ansj.library.StopLibrary;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.nlpcn.commons.lang.util.CollectionUtil;
import org.nlpcn.commons.lang.util.MapCount;
import org.nlpcn.commons.lang.util.StringUtil;

/* loaded from: input_file:org/ansj/app/phrase/PhraseExtractor.class */
public class PhraseExtractor {
    /** Frequency assumed by {@code getFrequency} for a term that is not present in {@code termMap}. */
    public static double DEFAULT_TERM_FREQUENCY = 10.24d;

    /** Initial capacity of {@code termMap} and the number of entries kept when {@code addTerm} prunes it. */
    public static int TERM_MAP_CAPACITY = 100000;

    /** Initial capacity of {@code occurrenceMap} and the number of entries kept when {@code addOccurrence} prunes it. */
    public static int OCCURRENCE_MAP_CAPACITY = 100000;

    /** Slack ratio: a map is pruned once its size reaches {@code capacity * (1 + FACTOR)}. */
    public static float FACTOR = 0.2f;

    /** {@code nbest} only runs the pairwise de-duplication when fewer candidates than this remain. */
    public static int DEDUP_THRESHOLD = 2000;

    /** Filter passed to the analysis result in {@code seg2sentence}. */
    private StopRecognition sr;

    /** Running total of terms seen over all {@code fromText} calls. */
    private int totalTerm;

    /** Segmenter used to split sentences into terms. */
    private Analysis analysis = new NlpAnalysis();

    /** Maximum character length of a candidate phrase generated by {@code fromText}. */
    private int length = 10;

    /** Term text to the count accumulated by {@code fromText}. */
    private final Map<String, Integer> termMap = new HashMap<>(TERM_MAP_CAPACITY);

    /** Candidate phrase text to its collected statistics. */
    private final Map<String, Occurrence> occurrenceMap = new HashMap<>(OCCURRENCE_MAP_CAPACITY);

    /**
     * Creates an extractor whose default filter drops terms with a blank,
     * {@code "null"} or {@code "w"} nature string, and additionally every term
     * rejected by the recognition returned from {@code StopLibrary.get()}
     * (when that call returns a non-null value).
     */
    public PhraseExtractor() {
        final StopRecognition libraryStops = StopLibrary.get();
        this.sr = new StopRecognition() {
            @Override
            public boolean filter(Term term) {
                String nature = term.getNatureStr();
                if (StringUtil.isBlank(nature)) {
                    return true;
                }
                if ("null".equals(nature) || "w".equals(nature)) {
                    return true;
                }
                // Fall back to the library-provided filter, if one is available.
                return libraryStops != null && libraryStops.filter(term);
            }
        };
    }

    /**
     * Replaces the segmenter used by subsequent {@code fromText} calls.
     *
     * @param analysis the analysis to use instead of the default {@code NlpAnalysis}
     * @return this extractor, to allow call chaining
     */
    public PhraseExtractor setAnalysis(Analysis analysis) {
        this.analysis = analysis;
        return this;
    }

    /**
     * Replaces the term filter installed by the constructor.
     *
     * @param stopRecognition the filter applied to every segmented sentence
     * @return this extractor, to allow call chaining
     */
    public PhraseExtractor setStopRecognition(StopRecognition stopRecognition) {
        this.sr = stopRecognition;
        return this;
    }

    /**
     * Sets the maximum character length of a candidate phrase; {@code fromText}
     * stops extending a phrase once its concatenated text is longer than this.
     *
     * @param i the new maximum phrase length, in characters
     * @return this extractor, to allow call chaining
     */
    public PhraseExtractor setLength(int i) {
        this.length = i;
        return this;
    }

    /**
     * Segments {@code str} into sentences and terms and accumulates statistics
     * for every run of consecutive terms whose concatenated text is at most
     * {@code length} characters long.
     *
     * <p>For each candidate the frequency and the immediate left/right
     * neighbour terms are recorded. Term counts are updated once per visit of
     * the inner loop, i.e. a term is counted for every phrase start that
     * reaches it.
     *
     * @param str the raw text to learn from
     */
    public void fromText(String str) {
        List<Term> window = new ArrayList<Term>();
        StringBuilder phrase = new StringBuilder();
        for (List<Term> sentence : seg2sentence(str)) {
            final int termCount = sentence.size();
            this.totalTerm += termCount;
            for (int start = 0; start < termCount; start++) {
                for (int end = start; end < termCount; end++) {
                    Term current = sentence.get(end);
                    String word = current.getRealName();
                    if (this.termMap.containsKey(word)) {
                        this.termMap.put(word, Integer.valueOf(this.termMap.get(word).intValue() + 1));
                    } else {
                        addTerm(word);
                    }

                    phrase.append(word);
                    if (phrase.length() > this.length) {
                        // The candidate became too long; move on to the next start position.
                        break;
                    }
                    window.add(current);

                    String key = phrase.toString();
                    Occurrence occurrence;
                    if (this.occurrenceMap.containsKey(key)) {
                        occurrence = this.occurrenceMap.get(key);
                    } else {
                        // Copy the window because it is cleared and reused for the next start.
                        occurrence = new Occurrence(new ArrayList<Term>(window));
                    }
                    if (start > 0) {
                        occurrence.addLeftTerm(sentence.get(start - 1).getRealName());
                    }
                    if (end < termCount - 1) {
                        occurrence.addRightTerm(sentence.get(end + 1).getRealName());
                    }
                    occurrence.increaseFrequency();
                    if (occurrence.getFrequency() == 1) {
                        // First sighting: register the freshly created occurrence.
                        addOccurrence(key, occurrence);
                    }
                }
                window.clear();
                phrase.setLength(0);
            }
        }
    }

    /**
     * Registers a new term with count 1. When the term map has reached
     * {@code TERM_MAP_CAPACITY * (1 + FACTOR)} entries it is first pruned:
     * the entries are ordered with {@code CollectionUtil.sortMapByValue(termMap, 1)}
     * and everything after the first {@code TERM_MAP_CAPACITY} is removed.
     *
     * @param str the term text to add
     */
    private void addTerm(String str) {
        final int limit = (int) (TERM_MAP_CAPACITY * (1.0f + FACTOR));
        if (this.termMap.size() >= limit) {
            List<Map.Entry<String, Integer>> ranked = CollectionUtil.sortMapByValue(this.termMap, 1);
            for (Map.Entry<String, Integer> surplus : ranked.subList(TERM_MAP_CAPACITY, ranked.size())) {
                this.termMap.remove(surplus.getKey());
            }
        }
        this.termMap.put(str, 1);
    }

    /**
     * Stores a new candidate phrase. When the occurrence map has reached
     * {@code OCCURRENCE_MAP_CAPACITY * (1 + FACTOR)} entries, all scores are
     * recomputed and only the {@code OCCURRENCE_MAP_CAPACITY} best-scoring
     * entries are kept before the new one is inserted.
     *
     * @param str        the phrase text used as key
     * @param occurrence the statistics object for that phrase
     */
    private void addOccurrence(String str, Occurrence occurrence) {
        final int limit = (int) (OCCURRENCE_MAP_CAPACITY * (1.0f + FACTOR));
        if (this.occurrenceMap.size() >= limit) {
            calculateScore();
            List<Map.Entry<String, Occurrence>> ranked =
                    new ArrayList<Map.Entry<String, Occurrence>>(this.occurrenceMap.entrySet());
            // Highest score first.
            Collections.sort(ranked, new Comparator<Map.Entry<String, Occurrence>>() {
                @Override
                public int compare(Map.Entry<String, Occurrence> left, Map.Entry<String, Occurrence> right) {
                    return Double.compare(right.getValue().getScore(), left.getValue().getScore());
                }
            });
            for (Map.Entry<String, Occurrence> surplus : ranked.subList(OCCURRENCE_MAP_CAPACITY, ranked.size())) {
                this.occurrenceMap.remove(surplus.getKey());
            }
        }
        this.occurrenceMap.put(str, occurrence);
    }

    /**
     * Returns up to {@code i} candidate phrases with the highest score.
     *
     * <p>Scores are recomputed first. Candidates are then discarded when they
     * are a single, non-new-word term shorter than five characters, when the
     * phrase is shorter than two characters, or when neither the left nor the
     * right entropy is positive. If fewer than {@code DEDUP_THRESHOLD}
     * candidates remain, near-duplicates are removed as well. Note that the
     * discarded candidates are removed from the extractor's internal map.
     *
     * @param i the maximum number of phrases to return
     * @return the best phrases, highest score first
     */
    public List<Map.Entry<String, Occurrence>> nbest(int i) {
        calculateScore();

        Set<String> rejected = new HashSet<String>();
        for (Map.Entry<String, Occurrence> candidate : this.occurrenceMap.entrySet()) {
            String phrase = candidate.getKey();
            Occurrence stats = candidate.getValue();
            List<Term> terms = stats.getTerms();
            boolean shortKnownWord = terms.size() < 2 && !terms.get(0).isNewWord() && phrase.length() < 5;
            if (shortKnownWord
                    || phrase.length() < 2
                    || Double.compare(Math.max(stats.getLeftEntropy(), stats.getRightEntropy()), 0.0d) <= 0) {
                rejected.add(phrase);
            }
        }
        this.occurrenceMap.keySet().removeAll(rejected);

        List<Map.Entry<String, Occurrence>> ranked =
                new ArrayList<Map.Entry<String, Occurrence>>(this.occurrenceMap.entrySet());
        if (this.occurrenceMap.size() < DEDUP_THRESHOLD) {
            Set<String> duplicates = new HashSet<String>();
            dedup(ranked, duplicates);
            this.occurrenceMap.keySet().removeAll(duplicates);
            ranked.clear();
            ranked.addAll(this.occurrenceMap.entrySet());
        }

        // Highest score first.
        Collections.sort(ranked, new Comparator<Map.Entry<String, Occurrence>>() {
            @Override
            public int compare(Map.Entry<String, Occurrence> left, Map.Entry<String, Occurrence> right) {
                return Double.compare(right.getValue().getScore(), left.getValue().getScore());
            }
        });

        List<Map.Entry<String, Occurrence>> best = new ArrayList<Map.Entry<String, Occurrence>>(i);
        Iterator<Map.Entry<String, Occurrence>> cursor = ranked.iterator();
        while (best.size() != i && cursor.hasNext()) {
            best.add(cursor.next());
        }
        return best;
    }

    /**
     * Recomputes PMI, left/right entropy and the combined score of every
     * candidate phrase in {@code occurrenceMap}.
     *
     * <p>The combined score is the sum of four components, each divided by its
     * total over all candidates: PMI, left entropy, right entropy, and a
     * per-phrase sum of a per-term weight. The weight of a term is
     * {@code (extra + count) / count}, where {@code count} and {@code extra}
     * are the values accumulated for that term in two {@code MapCount}s while
     * walking all candidates ({@code extra} adds "phrase size - 1" per visit).
     * Finally {@code calculateScore2} is applied to each candidate.
     *
     * <p>A component whose total is exactly zero is skipped (contributes 0)
     * instead of being divided by zero, which previously produced NaN scores
     * for every candidate.
     */
    private void calculateScore() {
        Set<Map.Entry<String, Occurrence>> entries = this.occurrenceMap.entrySet();
        double pmiTotal = 0.0d;
        double leftEntropyTotal = 0.0d;
        double rightEntropyTotal = 0.0d;
        MapCount<String> visitsByTerm = new MapCount<String>();
        MapCount<String> extraTermsByTerm = new MapCount<String>();
        for (Map.Entry<String, Occurrence> entry : entries) {
            Occurrence occurrence = entry.getValue();
            occurrence.setPmi(calculateMutualInformation(entry.getKey(), occurrence.getTerms()));
            pmiTotal += occurrence.getPmi();
            occurrence.setLeftEntropy(calculateEntropy(occurrence.getLeftTerms()));
            leftEntropyTotal += occurrence.getLeftEntropy();
            occurrence.setRightEntropy(calculateEntropy(occurrence.getRightTerms()));
            rightEntropyTotal += occurrence.getRightEntropy();
            double otherTerms = occurrence.getTerms().size() - 1;
            for (Term term : occurrence.getTerms()) {
                visitsByTerm.add(term.getRealName());
                extraTermsByTerm.add(term.getRealName(), otherTerms);
            }
        }

        // Per-term weight: (extra + count) / count.
        Map<String, Double> weightByTerm = new HashMap<String, Double>(visitsByTerm.size());
        for (Map.Entry<String, Double> entry : visitsByTerm.get().entrySet()) {
            double count = entry.getValue().doubleValue();
            double extra = extraTermsByTerm.get().get(entry.getKey()).doubleValue();
            weightByTerm.put(entry.getKey(), Double.valueOf((extra + count) / count));
        }

        // Temporarily store the summed term weights in the score field.
        double weightTotal = 0.0d;
        for (Map.Entry<String, Occurrence> entry : entries) {
            Occurrence occurrence = entry.getValue();
            double weight = 0.0d;
            for (Term term : occurrence.getTerms()) {
                weight += weightByTerm.get(term.getRealName()).doubleValue();
            }
            occurrence.setScore(weight);
            weightTotal += weight;
        }

        for (Map.Entry<String, Occurrence> entry : entries) {
            Occurrence occurrence = entry.getValue();
            occurrence.setScore(safeRatio(occurrence.getPmi(), pmiTotal)
                    + safeRatio(occurrence.getLeftEntropy(), leftEntropyTotal)
                    + safeRatio(occurrence.getRightEntropy(), rightEntropyTotal)
                    + safeRatio(occurrence.getScore(), weightTotal));
            calculateScore2(occurrence);
        }
    }

    /** Returns {@code value / total}, or 0 when {@code total} is zero so no NaN/Infinity leaks into scores. */
    private static double safeRatio(double value, double total) {
        return total == 0.0d ? 0.0d : value / total;
    }

    /**
     * Penalises a phrase whose first and/or last term has a nature string of
     * {@code "c"}, {@code "p"} or anything starting with {@code 'u'}
     * (presumably conjunction / preposition / auxiliary tags - confirm against
     * the tag set in use). Each matching end multiplies the score by 0.001.
     *
     * <p>A null or empty nature string is treated as "no penalty" rather than
     * throwing from {@code charAt(0)}.
     *
     * @param occurrence the phrase whose score is adjusted in place
     */
    private void calculateScore2(Occurrence occurrence) {
        List<Term> terms = occurrence.getTerms();
        if (isPenalizedNature(terms.get(0).getNatureStr())) {
            occurrence.setScore(occurrence.getScore() * 0.001d);
        }
        if (isPenalizedNature(terms.get(terms.size() - 1).getNatureStr())) {
            occurrence.setScore(occurrence.getScore() * 0.001d);
        }
    }

    /** Tells whether a nature string is one of the penalised tags; null-safe and empty-safe. */
    private static boolean isPenalizedNature(String natureStr) {
        if (natureStr == null) {
            return false;
        }
        return "c".equals(natureStr) || "p".equals(natureStr) || natureStr.startsWith("u");
    }

    /**
     * Computes {@code -sum(p * ln p)} over the counts held in {@code mapCount},
     * where {@code p} is each count divided by the sum of all counts.
     *
     * @param mapCount the neighbour counts
     * @return the entropy (natural logarithm); 0 when there are no counts
     */
    private double calculateEntropy(MapCount<String> mapCount) {
        Set<Map.Entry<String, Double>> counts = mapCount.get().entrySet();

        double total = 0.0d;
        for (Map.Entry<String, Double> count : counts) {
            total += count.getValue().doubleValue();
        }

        double entropy = 0.0d;
        for (Map.Entry<String, Double> count : counts) {
            double probability = count.getValue().doubleValue() / total;
            entropy += (-probability) * Math.log(probability);
        }
        return entropy;
    }

    /**
     * Computes the mutual-information value of a phrase.
     *
     * <p>For a single-term phrase this is {@code -ln(f(term) / totalTerm)}.
     * For a phrase of {@code k} terms it is
     * {@code ln(f(phrase) * totalTerm^(k-1)) - ln(product of f(term_i))},
     * where term frequencies come from {@code getFrequency}.
     *
     * @param str  the phrase text; must be a key of {@code occurrenceMap}
     * @param list the terms making up the phrase
     * @return the mutual-information value
     */
    private double calculateMutualInformation(String str, List<Term> list) {
        final int termCount = list.size();
        if (termCount == 1) {
            return -Math.log(getFrequency(list.get(0).getRealName()) / this.totalTerm);
        }
        double frequencyProduct = 1.0d;
        for (Term term : list) {
            frequencyProduct *= getFrequency(term.getRealName());
        }
        // The exponent is the number of terms minus one (the decompiled code referenced an undefined local here).
        double joint = this.occurrenceMap.get(str).getFrequency() * Math.pow(this.totalTerm, termCount - 1);
        return Math.log(joint) - Math.log(frequencyProduct);
    }

    /**
     * Looks up the recorded count of a term.
     *
     * @param str the term text
     * @return the count from {@code termMap}, or {@code DEFAULT_TERM_FREQUENCY} when the term is unknown
     */
    private double getFrequency(String str) {
        Integer count = this.termMap.get(str);
        if (count == null) {
            return DEFAULT_TERM_FREQUENCY;
        }
        return count.intValue();
    }

    /**
     * Collects the keys of near-duplicate phrases into {@code set}.
     *
     * <p>Every pair (a, b) with a before b in {@code list} is compared using
     * PMI and the mean of left and right entropy:
     * <ul>
     *   <li>a contains b: b is marked when a is at least as good on both measures;</li>
     *   <li>b contains a: a is marked when b is at least as good on both, and
     *       the scan for a stops;</li>
     *   <li>otherwise, if the cosine similarity of their terms exceeds 0.75:
     *       b is marked when it is not longer than a and a is at least as good
     *       on both measures; else a is marked (and its scan stops) when a is
     *       not longer than b and b is at least as good on both.</li>
     * </ul>
     *
     * <p>The inner scan is a bounded {@code for} loop; the previous
     * {@code while (true)} form never terminated once the index reached the
     * end of the list or the similarity was below the threshold.
     *
     * @param list the candidate entries to compare
     * @param set  receives the keys that should be removed
     */
    private void dedup(List<Map.Entry<String, Occurrence>> list, Set<String> set) {
        final int size = list.size();
        for (int i = 0; i < size; i++) {
            Map.Entry<String, Occurrence> first = list.get(i);
            String firstKey = first.getKey();
            Occurrence firstOcc = first.getValue();
            double firstPmi = firstOcc.getPmi();
            double firstEntropy = (firstOcc.getLeftEntropy() + firstOcc.getRightEntropy()) / 2.0d;

            for (int j = i + 1; j < size; j++) {
                Map.Entry<String, Occurrence> second = list.get(j);
                String secondKey = second.getKey();
                Occurrence secondOcc = second.getValue();
                double secondPmi = secondOcc.getPmi();
                double secondEntropy = (secondOcc.getLeftEntropy() + secondOcc.getRightEntropy()) / 2.0d;

                boolean firstAtLeastAsGood = 0 <= Double.compare(firstPmi, secondPmi)
                        && 0 <= Double.compare(firstEntropy, secondEntropy);
                boolean secondAtLeastAsGood = 0 <= Double.compare(secondPmi, firstPmi)
                        && 0 <= Double.compare(secondEntropy, firstEntropy);

                if (firstKey.contains(secondKey)) {
                    if (firstAtLeastAsGood) {
                        set.add(secondKey);
                    }
                } else if (secondKey.contains(firstKey)) {
                    if (secondAtLeastAsGood) {
                        set.add(firstKey);
                        break; // the first phrase is gone; no need to compare it further
                    }
                } else if (0.75d < calculateCosineSimilarity(firstOcc.getTerms(), secondOcc.getTerms())) {
                    if (secondKey.length() <= firstKey.length() && firstAtLeastAsGood) {
                        set.add(secondKey);
                    } else if (firstKey.length() <= secondKey.length() && secondAtLeastAsGood) {
                        set.add(firstKey);
                        break; // the first phrase is gone; no need to compare it further
                    }
                }
            }
        }
    }

    /**
     * Computes the cosine similarity of two term lists, using the per-term
     * counts of each list (keyed by the term's real name) as vectors.
     *
     * @param list  the terms of the first phrase
     * @param list2 the terms of the second phrase
     * @return the cosine of the two count vectors, or 0 when either vector has a non-positive squared norm
     */
    private double calculateCosineSimilarity(List<Term> list, List<Term> list2) {
        MapCount<String> firstCounts = new MapCount<String>();
        for (Term term : list) {
            firstCounts.add(term.getRealName());
        }
        double firstSquaredNorm = 0.0d;
        for (Double count : firstCounts.get().values()) {
            firstSquaredNorm += Math.pow(count.doubleValue(), 2.0d);
        }
        if (Double.compare(firstSquaredNorm, 0.0d) <= 0) {
            return 0.0d;
        }

        MapCount<String> secondCounts = new MapCount<String>();
        for (Term term : list2) {
            secondCounts.add(term.getRealName());
        }
        double secondSquaredNorm = 0.0d;
        for (Double count : secondCounts.get().values()) {
            secondSquaredNorm += Math.pow(count.doubleValue(), 2.0d);
        }
        if (Double.compare(secondSquaredNorm, 0.0d) <= 0) {
            return 0.0d;
        }

        // Only terms present in both phrases contribute to the dot product.
        Set<String> shared = new HashSet<String>(firstCounts.get().keySet());
        shared.retainAll(secondCounts.get().keySet());
        double dotProduct = 0.0d;
        for (String name : shared) {
            dotProduct += firstCounts.get().get(name).doubleValue() * secondCounts.get().get(name).doubleValue();
        }
        return dotProduct / (Math.sqrt(firstSquaredNorm) * Math.sqrt(secondSquaredNorm));
    }

    /**
     * Splits the text into sentences and segments each one with the configured
     * analysis, applying the configured stop recognition to the result.
     *
     * @param str the raw text
     * @return one term list per sentence, in sentence order
     */
    private List<List<Term>> seg2sentence(String str) {
        List<String> sentences = toSentenceList(str);
        List<List<Term>> segmented = new ArrayList<List<Term>>(sentences.size());
        for (String sentence : sentences) {
            List<Term> terms = this.analysis.parseStr(sentence).recognition(this.sr).getTerms();
            segmented.add(terms);
        }
        return segmented;
    }

    /**
     * Cuts the text into sentences.
     *
     * <p>A sentence ends at whitespace (tab, newline, carriage return, space,
     * U+00A0), at ASCII {@code ! , ; ?}, at the full-width forms of
     * {@code 。 ！ ， ； ？}, at a {@code '.'} that is followed by a character
     * above code point 128, and at a doubled ellipsis character (U+2026).
     * Whitespace before the first character of a sentence is skipped, and each
     * sentence is trimmed before being stored; empty results are dropped.
     *
     * @param str the raw text
     * @return the sentences in order of appearance
     */
    private List<String> toSentenceList(String str) {
        List<String> sentences = new LinkedList<String>();
        StringBuilder current = new StringBuilder();
        final int end = str.length();
        for (int pos = 0; pos < end; pos++) {
            char ch = str.charAt(pos);
            // Do not start a sentence with whitespace.
            if (current.length() == 0 && (Character.isWhitespace(ch) || ch == ' ')) {
                continue;
            }
            current.append(ch);

            boolean sentenceEnds;
            switch (ch) {
                case '.':
                    // A dot only terminates when followed by a non-ASCII character.
                    sentenceEnds = pos < end - 1 && str.charAt(pos + 1) > 128;
                    break;
                case 8230: // U+2026 horizontal ellipsis
                    sentenceEnds = pos < end - 1 && str.charAt(pos + 1) == 8230;
                    if (sentenceEnds) {
                        // Consume the second ellipsis character as part of this sentence.
                        current.append((char) 8230);
                        pos++;
                    }
                    break;
                case '\t':
                case '\n':
                case '\r':
                case ' ':
                case '!':
                case ',':
                case ';':
                case '?':
                case 160:   // U+00A0 no-break space
                case 12290: // U+3002 ideographic full stop
                case 65281: // U+FF01 full-width exclamation mark
                case 65292: // U+FF0C full-width comma
                case 65307: // U+FF1B full-width semicolon
                case 65311: // U+FF1F full-width question mark
                    sentenceEnds = true;
                    break;
                default:
                    sentenceEnds = false;
                    break;
            }

            if (sentenceEnds) {
                insertIntoList(current, sentences);
                current = new StringBuilder();
            }
        }
        if (current.length() > 0) {
            insertIntoList(current, sentences);
        }
        return sentences;
    }

    /**
     * Appends the trimmed content of {@code sb} to {@code list}, unless the
     * trimmed content is empty.
     *
     * @param sb   the buffer holding the sentence text
     * @param list the list receiving the sentence
     */
    private void insertIntoList(StringBuilder sb, List<String> list) {
        String sentence = sb.toString().trim();
        if (sentence.isEmpty()) {
            return;
        }
        list.add(sentence);
    }
}
