#!/usr/bin/python
# -*- coding: utf-8 -*-

####
# Copyright (C) 2016 Kim Gerdes
# kim AT gerdes. fr
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
####

import codecs, glob, random, shutil, time, sys, os, argparse, psutil, subprocess
import regex as re
#import conll

verbose = False
#verbose = True


def parsing(infile, outfolder="parses/", memory=None, lemmatized=False):
    """
    parsing function
    parserType is always graph!
    infile: an empty conll file
    """
    if outfolder[-1] != "/":
        outfolder = outfolder + "/"
    outfile = outfolder + os.path.basename(infile)
    if outfile.endswith(".empty.conll"):
        outfile = outfile[:-len(".empty.conll")]
    if not memory:
        memory = str(psutil.virtual_memory().available / 1000000000) + "G"
    anna, lemclass, tagclass, parseclass = "mate/anna-3.3.jar", "is2.lemmatizer.Lemmatizer", "is2.tag.Tagger", "is2.parser.Parser"
    modelFolder = "models/"
    lemodel = modelFolder + "LemModel"
    tagmodel = modelFolder + "TagModel"
    parsemodel = modelFolder + "ParseModel"
    lemcommand = "java -Xmx{memory} -cp {anna} {lemclass} -model {lemodel} -test {infile} -out {outfile}.lem.conll".format(memory=memory, anna=anna, lemclass=lemclass, infile=infile, lemodel=lemodel, outfile=outfile)
    tagcommand = "java -Xmx{memory} -cp {anna} {tagclass} -model {tagmodel} -test {outfile}.lem.conll -out {outfile}.tag.conll".format(memory=memory, anna=anna, tagclass=tagclass, tagmodel=tagmodel, outfile=outfile)
    parsecommand = "java -Xmx{memory} -cp {anna} {parseclass} -model {parsemodel} -test {outfile}.tag.conll -out {outfile}.parse.conll".format(memory=memory, anna=anna, parseclass=parseclass, parsemodel=parsemodel, outfile=outfile)

    if lemodel and lemodel[-1] != "/":
        if verbose: print "\n\n========== lemmatizing...", lemcommand
        p1 = subprocess.Popen([lemcommand], shell=True, stdout=subprocess.PIPE)
        out, err = p1.communicate()
        if verbose: print out, err
    else:
        # TODO: complete this part (for non-inflectional languages like Chinese and for pre-lemmatized files)
        # note: this branch relies on the local conll and newconvert helper modules, which are not imported above
        if lemmatized:
            print "copying", outfolder + os.path.basename(infile), "as lemma file"
            shutil.copyfile(outfolder + os.path.basename(infile), outfolder + os.path.basename(infile) + ".lem.conll")
        else:
            print "adding toks as lems", outfolder + os.path.basename(infile)
            trees = conll.conllFile2trees(infile)
            with codecs.open(outfolder + os.path.basename(infile) + ".lem.conll", "w", "utf-8") as lemf:
                for tree in trees:
                    lemf.write(newconvert.treeToEmptyConll14Text(tree, lemma=False) + "\n")

    if verbose: print "\n\n========== tagging...", tagcommand
    p1 = subprocess.Popen([tagcommand], shell=True, stdout=subprocess.PIPE)
    out, err = p1.communicate()
    if verbose: print out, err

    print "\n\n========== dep analysis...", parsecommand
    p1 = subprocess.Popen([parsecommand], shell=True, stdout=subprocess.PIPE)
    out = p1.stdout.read()
    if verbose: print out
    print "\n\n========== parsed"
    #if checkIntegrity(outfile+'_parse') == False:
        #print "*********ERROR IN FILE", outfile+"_parse", "Please Try again*********"
    return outfile + ".parse.conll"
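
# Illustrative use of parsing() (the file name is hypothetical; assumes mate/anna-3.3.jar
# and the French LemModel/TagModel/ParseModel files are present under models/):
#   parsedfile = parsing("parses/example.empty.conll", outfolder="parses/", memory="4G")
#   # writes parses/example.lem.conll and parses/example.tag.conll along the way
#   # and returns "parses/example.parse.conll"
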
def lireDictionnaires():
    """
    reads the lexicon files (.sfplm) containing multiword expressions
    """
    droporfeo = 'lexiqueMultiMots/'  # where we can find the lexicon folder
    specialCharDic = {}
    #onlyLetters = re.compile(ur'''^\w+$''', re.U + re.I)
    for lexfile in glob.glob(droporfeo + '*.sfplm'):  # read the .sfplm files
        #print "reading", lexfile
        with codecs.open(lexfile, "r", "utf8") as f:
            for ligne in f:
                if len(ligne) and "\t" in ligne:
                    t, lem = ligne.strip().split("\t")
                    #if (not onlyLetters.match(t[0])) or (not onlyLetters.match(t[1:-1].replace("-",""))) or not onlyLetters.match(t[-1]):
                    specialCharDic[t] = None
    if verbose: print len(specialCharDic), "special character words"
    specialCharWords = sorted(specialCharDic, key=len, reverse=True)
    return specialCharWords


def removePuncsFromConllfile(conllfile):
    """
    transforms the punctuation roots (governor 0) into empty governors (-1)
    """
    with codecs.open(conllfile, "r", "utf-8") as f:
        conlltext = f.read()
    conlltext = conlltext.replace("0 _ punc", "-1 _ punc")
    with codecs.open(conllfile, "w", "utf-8") as f:
        f.write(conlltext)


def emptyFromSentence(sentencefile, remultimatch=None, outfolder="."):
    """
    sentencefile with one sentence per line --> conll14 .empty.conll file
    if remultimatch is not None, it is used for tokenization
    """
    newname = os.path.basename(sentencefile)
    if newname.endswith(".txt"):
        newname = newname[:-len(".txt")]
    newname += ".empty.conll"
    outname = os.path.join(outfolder, newname)
    with codecs.open(sentencefile, "r", "utf-8") as f, codecs.open(outname, "w", "utf-8") as g:
        for li in f:
            if remultimatch:
                toks = tokenize(li, remultimatch)
            else:
                toks = simpletokenize(li)
            for num, tok in enumerate(toks):
                g.write("\t".join([str(num + 1), tok] + ["_"] * 12) + '\n')
            g.write("\n")
    return outname


def simpletokenize(text, returnMatchInfo=False):
    """
    simple punctuation- and space-based tokenization, specific to French:
    apostrophes stay glued to the preceding word (d'un -> d' un);
    in English we would need the opposite: don't -> do n't, Mike's -> Mike 's
    """
    reponct = re.compile(ur'''(\s*[.;:,!?\(\)§"'«»\d]+)''', re.U + re.M)  # default punctuation matching; - removed from the list
    renogroupponct = re.compile(ur'''(\s*[;:,«»\(\)"])''', re.U + re.M)  # signs that have to stand alone - they cannot be grouped
    # do the remaining simple token-based splitting
    text = reponct.sub(r" \1 ", text).replace(" '", "'")  # spaces around punctuation, then re-glue apostrophes to the preceding word (French specific!)
    text = renogroupponct.sub(r" \1 ", text).replace(" ~", "~")  # spaces around non-groupable punctuation
    return text.split()


def numurltokenize(text, returnMatchInfo=False):
    """
    number and url tokenization
    """
    reurl = re.compile(ur'''(https?://|\w+@)?[\w\d\%\.]*\w\w\.\w\w[\w\d~/\%\#]*(\?[\w\d~/\%\#]+)*''', re.U + re.M + re.I)
    resignswithnumbers = re.compile(ur'''\d+[\d,.\s]+''', re.U + re.M)
    # do the url recognition
    toks = []  # couples: (tok, todo=0 / done=1)
    laststart = 0
    for m in reurl.finditer(text):
        #print 'reurl:%02d-%02d: %s' % (m.start(), m.end(), m.group(0))
        toks += [(text[laststart:m.start()], 0), (m.group(0).strip(), 1)]
        laststart = m.end()
    toks += [(text[laststart:], 0)]
    # do the number recognition
    ntoks = []
    for chunk, done in toks:
        if done:
            ntoks += [(chunk, done)]
        else:
            laststart = 0
            for m in resignswithnumbers.finditer(chunk):
                #print 'resignswithnumbers:%02d-%02d: %s' % (m.start(), m.end(), m.group(0))
                ntoks += [(chunk[laststart:m.start()], 0), (m.group(0).strip(), 1)]
                laststart = m.end()
            ntoks += [(chunk[laststart:], 0)]
    if returnMatchInfo:
        return ntoks
    return [t for t, done in ntoks]


def tokenize(line, remultimatch):
    """
    tokenization of line
    uses remultimatch = compiled list of words ordered by size (bigger first)
    returns list of tokens
    """
    # prepare line:
    line = line.replace(u"’", "'")
    respaces = re.compile(ur'\s+')
    line = respaces.sub(r" ", line)
    # go!
    toks = []
    for chunk, done in numurltokenize(line, returnMatchInfo=True):
        if done:
            toks += [chunk]
        else:
            laststart = 0
            for m in remultimatch.finditer(chunk):
                toks += simpletokenize(chunk[laststart:m.start()]) + [m.group(0).strip()]  # strip is needed for cases like "etc. !"
                laststart = m.end()
            toks += simpletokenize(chunk[laststart:])
    return toks
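
# Illustrative example (not part of the original pipeline): on French input,
# simpletokenize splits off surrounding punctuation but keeps elided articles
# glued to the apostrophe, e.g.
#   simpletokenize(u"il dit: d'accord, merci.")
#   -> [u'il', u'dit', u':', u"d'", u'accord', u',', u'merci', u'.']
# tokenize() additionally protects URLs, numbers, and the multiword expressions
# matched by remultimatch before applying the same splitting.
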
def parseSentenceFile(sentencefile, remultimatch, outfolder=None, memory=None, removePunct=True):
    """
    main function
    sentencefile: file with one sentence per line
    remultimatch: compiled re to match multiwords
    """
    if not memory:
        memory = str(psutil.virtual_memory().available / 1000000000) + "G"
    if not outfolder:
        outfolder = "parses/"
    print "preparing", sentencefile, "..."
    emptyConll = emptyFromSentence(sentencefile, remultimatch=remultimatch, outfolder=outfolder)
    print "parsing", sentencefile, "..."
    parsedfile = parsing(emptyConll, outfolder=outfolder, memory=memory)
    print "cleaning", parsedfile, "..."
    if removePunct:
        removePuncsFromConllfile(parsedfile)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='wrapper of tokenizer and mate parser with French syntactic models')
    parser.add_argument('-s', '--sentence', help='sentence between quotes', type=lambda s: unicode(s, 'utf8'), required=False)
    parser.add_argument('-f', '--sentencesFile', help='file containing one sentence per line', type=lambda s: unicode(s, 'utf8'), required=False)
    args = vars(parser.parse_args())

    ti = time.time()
    specialCharWords = lireDictionnaires()
    # prepare multiword matching:
    multimatch = ur"(( |\b)" + ur"( |\b)|( |\b)".join([re.escape(mm) for mm in specialCharWords]) + ur"( |\b))"
    remultimatch = re.compile(multimatch, re.U + re.I)

    if args.get("sentencesFile", None):
        parseSentenceFile(args.get("sentencesFile", None), remultimatch=remultimatch)
    if args.get("sentence", None):
        with codecs.open("parses/singleSentence.txt", "w", "utf-8") as singleSentenceFile:
            singleSentenceFile.write(args.get("sentence", None) + "\n")
        parseSentenceFile("parses/singleSentence.txt", remultimatch=remultimatch)
        with codecs.open("parses/singleSentence.parse.conll", "r", "utf-8") as singleSentenceParse:
            print singleSentenceParse.read()
    if verbose: print "it took", time.time() - ti, "seconds"
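
# Illustrative command lines (the script name "parseFrench.py" is hypothetical; use
# whatever name this file is saved under, run from a directory containing mate/,
# models/, lexiqueMultiMots/ and an existing parses/ folder to write into):
#   python parseFrench.py -s "le chat dort sur le canapé"
#   python parseFrench.py -f mySentences.txt
# The first call writes and prints parses/singleSentence.parse.conll; the second
# writes a .parse.conll file for the given sentence file into parses/.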