/*
 * Decompiled with CFR 0.152.
 */
package org.corpus_tools.peppermodules.nlpModules;

import com.neovisionaries.i18n.LanguageCode;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.io.FilenameUtils;
import org.corpus_tools.pepper.common.DOCUMENT_STATUS;
import org.corpus_tools.pepper.impl.PepperManipulatorImpl;
import org.corpus_tools.pepper.impl.PepperMapperImpl;
import org.corpus_tools.pepper.modules.PepperMapper;
import org.corpus_tools.pepper.modules.PepperModule;
import org.corpus_tools.pepper.modules.exceptions.PepperModuleException;
import org.corpus_tools.pepper.modules.exceptions.PepperModuleNotReadyException;
import org.corpus_tools.peppermodules.nlpModules.TokenizerProperties;
import org.corpus_tools.salt.common.SDocumentGraph;
import org.corpus_tools.salt.common.STextualDS;
import org.corpus_tools.salt.graph.Identifier;
import org.eclipse.emf.common.util.URI;
import org.osgi.service.component.annotations.Component;

@Component(name="TokenizerComponent", factory="PepperManipulatorComponentFactory")
public class Tokenizer
extends PepperManipulatorImpl {
    private Map<LanguageCode, HashSet<String>> abbreviationMap = null;

    public Tokenizer() {
        this.setName("Tokenizer");
        this.setSupplierContact(URI.createURI((String)"saltnpepper@lists.hu-berlin.de"));
        this.setSupplierHomepage(URI.createURI((String)"https://github.com/korpling/pepperModules-nlpModules"));
        this.setDesc("The tokenizer tokenzizes a document using the tokenizer provided by Salt. The tokenizer uses abbreviation lists and is implemented along the Treetaggers tokenizer by Helmut Schmid (see: http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/). ");
        this.setProperties(new TokenizerProperties());
    }

    public boolean isReadyToStart() throws PepperModuleNotReadyException {
        if (((TokenizerProperties)this.getProperties()).getAbbreviationFolder() != null) {
            this.loadAbbFolder();
        }
        return true;
    }

    private void loadAbbFolder() {
        File abbFolder = ((TokenizerProperties)this.getProperties()).getAbbreviationFolder();
        File[] abbFiles = abbFolder.listFiles();
        if (abbFiles != null) {
            for (File abbFile : abbFiles) {
                String ending = FilenameUtils.getExtension((String)abbFile.getName());
                LanguageCode langCode = LanguageCode.valueOf((String)ending);
                if (langCode == null) continue;
                if (this.abbreviationMap == null) {
                    this.abbreviationMap = new ConcurrentHashMap<LanguageCode, HashSet<String>>();
                }
                HashSet<String> abbreviations = null;
                try {
                    abbreviations = new HashSet<String>();
                    BufferedReader inReader = new BufferedReader(new InputStreamReader((InputStream)new FileInputStream(abbFile.getAbsolutePath()), "UTF8"));
                    String input = "";
                    while ((input = inReader.readLine()) != null) {
                        abbreviations.add(input);
                    }
                    inReader.close();
                }
                catch (FileNotFoundException e) {
                    throw new PepperModuleException((PepperModule)this, "Cannot tokenize the given text, because the file for abbreviation '" + abbFile.getAbsolutePath() + "' was not found.");
                }
                catch (IOException e) {
                    throw new PepperModuleException((PepperModule)this, "Cannot tokenize the given text, because can not read file '" + abbFile.getAbsolutePath() + "'.");
                }
                this.abbreviationMap.put(langCode, abbreviations);
            }
        }
    }

    public PepperMapper createPepperMapper(Identifier sElementId) {
        TokenizerMapper mapper = new TokenizerMapper();
        return mapper;
    }

    private class TokenizerMapper
    extends PepperMapperImpl {
        private TokenizerMapper() {
        }

        public DOCUMENT_STATUS mapSDocument() {
            SDocumentGraph sDocGraph = this.getDocument().getDocumentGraph();
            if (sDocGraph != null) {
                org.corpus_tools.peppermodules.nlpModules.tokenizer.Tokenizer tokenizer = new org.corpus_tools.peppermodules.nlpModules.tokenizer.Tokenizer();
                tokenizer.setsDocumentGraph(sDocGraph);
                tokenizer.setAddTextToSpan(((TokenizerProperties)this.getProperties()).getAddTextToSpan());
                if (Tokenizer.this.abbreviationMap != null) {
                    Set keys = Tokenizer.this.abbreviationMap.keySet();
                    for (LanguageCode lang : keys) {
                        tokenizer.addAbbreviation(lang, (HashSet)Tokenizer.this.abbreviationMap.get(lang));
                    }
                }
                if (sDocGraph.getTextualDSs() != null && sDocGraph.getTextualDSs().size() > 0) {
                    ArrayList<STextualDS> texts = new ArrayList<STextualDS>();
                    for (STextualDS sTextualDs : sDocGraph.getTextualDSs()) {
                        texts.add(sTextualDs);
                    }
                    for (STextualDS sTextualDs : texts) {
                        tokenizer.tokenize(sTextualDs);
                    }
                }
            }
            return DOCUMENT_STATUS.COMPLETED;
        }
    }
}

