#!/usr/bin/python
"""PhraseBuilder v1.0 (Simon Zwarts, Feb-2006).

Builds a phrase table from two GIZA++ alignment files, one trained in each
translation direction.  For every sentence pair the two word alignments are
intersected, grown with neighbouring points taken from their symmetric
difference, and all consistent phrase pairs of at most MaxPhraseLength words
are counted.  The table is printed as ``target ||| source ||| probability``.

The code runs unchanged on Python 2 and Python 3.

NOTE(review): this file was restored from a copy in which line breaks were
lost and some text between '<' and '>' characters had been stripped.  The
places that had to be reconstructed are marked with NOTE(review) below and
should be checked against an upstream copy if one is available.
"""
import sys
import string

MaxPhraseLength = 3

# Alignment matrix used by ExpandPhrase and CleanPhrases when no matrix is
# passed explicitly: one list per source word holding the indices of the
# target words it is aligned to.
NewMatrix = []

HelpFlags = ("-h", "--help", "-?")


def _GrowPhrase(x, y, n, m, down, up, LastRow, LastColumn):
    """Recursive worker of ExpandPhrase; see there for the contract."""
    if (n + 1 - x) > MaxPhraseLength:
        return []
    if abs(y - m) + 1 > MaxPhraseLength:
        return []
    # Never leave the matrix.
    if n > LastRow or m < 0 or m > LastColumn:
        return []
    result = []
    # The help text states that phrases of length 1 are not generated, so the
    # bare seed point itself is not a candidate.
    if n > x or m != y:
        result.append((x, n, min(y, m), max(y, m)))
    # While the column bound still sits on the seed column no direction has
    # been chosen; afterwards only the flagged direction may continue, so a
    # walk never turns back on itself.
    MayGoUp = up or m == y
    MayGoDown = down or m == y
    if MayGoUp:
        result += _GrowPhrase(x, y, n, m - 1, False, True, LastRow, LastColumn)      # 0
        result += _GrowPhrase(x, y, n + 1, m - 1, False, True, LastRow, LastColumn)  # 1
    result += _GrowPhrase(x, y, n + 1, m, down, up, LastRow, LastColumn)             # 2
    if MayGoDown:
        result += _GrowPhrase(x, y, n + 1, m + 1, True, False, LastRow, LastColumn)  # 3
        result += _GrowPhrase(x, y, n, m + 1, True, False, LastRow, LastColumn)      # 4
    return result


def ExpandPhrase(x, y, n, m, down, up, Matrix=None):
    """Return candidate phrases grown from the alignment point (x, y).

    We start with a certain word and try to build all the possible phrases
    given this starting point.  Some of the phrases are still invalid because
    they don't use all aligned words, so they have to be filtered later
    (see CleanPhrases).

    There are five possible directions to expand a phrase (rows grow to the
    right, the column bound moves up or down):

        +---+---+
        | 0 | 1 |
        +---+---+
        | x | 2 |
        +---+---+
        | 4 | 3 |
        +---+---+

    Parameters:
        x, y   -- row and column of the starting point.
        n, m   -- current last row and current moving column bound.
        down   -- the column bound may move to larger indices.
        up     -- the column bound may move to smaller indices.
        Matrix -- alignment matrix; defaults to the module-level NewMatrix.

    Returns a list (possibly with duplicates) of (x, n, low, high) tuples:
    rows x..n and columns low..high, both inclusive.  An empty list is
    returned as soon as either side would exceed MaxPhraseLength.

    NOTE(review): only the two MaxPhraseLength guards of the original body
    survived; the expansion itself is a reconstruction based on the comment
    above, the call site and the help text.  In particular the exact meaning
    of down/up and the decision not to emit the 1x1 seed are assumptions.
    Columns beyond the highest aligned column are never proposed because the
    sentence length is not known here.
    """
    if Matrix is None:
        Matrix = NewMatrix
    LastColumn = max([-1] + [max(Row) for Row in Matrix if Row])
    return _GrowPhrase(x, y, n, m, down, up, len(Matrix) - 1, LastColumn)


def CleanPhrases(PossiblePhrases, Matrix=None):
    """Keep only the candidates that are consistent with the alignment.

    A candidate (x, n, y, m) is kept when no word inside rows x..n is aligned
    outside columns y..m, and no word outside those rows is aligned to a
    column inside y..m.  Matrix defaults to the module-level NewMatrix.

    NOTE(review): the function header and the first check were partly lost;
    they are reconstructed from the surviving fragment '>m: Valid=0' and the
    call site.  The second check is as in the original.
    """
    if Matrix is None:
        Matrix = NewMatrix
    result = []
    for (x, n, y, m) in PossiblePhrases:
        Valid = True
        # check for alignments leaving the phrase
        for Row in Matrix[x:n + 1]:
            for Element in Row:
                if Element < y or Element > m:
                    Valid = False
        # check for other horizontal alignments
        if Valid:
            for Row in Matrix[:x] + Matrix[n + 1:]:
                for Element in Row:
                    if y <= Element <= m:
                        Valid = False
        if Valid:
            result.append((x, n, y, m))
    return result


def PrintBanner():
    """Print the program description."""
    for Line in (
            " PhraseBuilder v1.0",
            " Simon Zwarts Feb-2006",
            "",
            "This script builds the phrase table for the called Phrase",
            "based Statistical Machine Translation tools",
            "As input we need to have the alignment files of GIZA++",
            "One trained in one direction and one alignment file of a",
            "training in the other direction.",
            "",
            "Phrases of length 1 are not generated because this is already",
            "a direction output of GIZA++ word probability file",
            "Maximum Phrase Length is " + str(MaxPhraseLength)
            + ". Longer Phrases are ignored.",
            "",
            "The direction of the Phrase table is based on the direction",
            "of the first alignment file.",
            ""):
        print(Line)


def PrintUsage():
    """Print the command line synopsis."""
    for Line in (
            "Usage: ",
            " PhraseBuilder.py -?",
            " * shows this help message",
            "",
            # The argument placeholders were missing from the restored text.
            " PhraseBuilder.py <alignment file 1> <alignment file 2>",
            " * runs the script over these files",
            ""):
        print(Line)


def OpenAlignmentFile(FileName):
    """Open FileName for reading; print a message and exit(-1) on failure."""
    try:
        # Read-only: the files are never written, so 'r+' is not required.
        return open(FileName, 'r')
    except (IOError, OSError):
        print("Error opening file \"" + FileName + "\"")
        sys.exit(-1)


def ReadDataLine(fd):
    """Return the next line of fd not starting with '#' ('' at end of file)."""
    Line = fd.readline()
    while Line[:1] == "#":
        Line = fd.readline()
    return Line


def ParseAlignment(Line, FileName):
    """Parse one GIZA++ 'word ({ 1 2 }) word ({ 3 }) ...' line.

    Returns one list of (1-based) integers per ' })'-separated chunk of Line.
    This includes the leading NULL entry and the chunk after the last
    marker; callers drop those with [1:-1].

    If a number cannot be parsed, a diagnostic naming FileName is printed and
    the program exits with status -1.
    """
    Rows = []
    for Chunk in Line.split(" })"):
        # Position just behind "({ "; 2 when the marker is absent.
        Start = Chunk.find("({") + 3
        Row = []
        for Number in Chunk[Start:].split(" "):
            if Number == "":
                continue
            try:
                Row.append(int(Number))
            except ValueError:
                if Start < 3:
                    print("Input files might be not consistent.")
                    print("Expected a line with ({ }) markers, but found:")
                    print(Line)
                    print("")
                    print("Sometimes Giza++ adds a badly placed comment marker on the previous line")
                    print("Check if previous line is a genuine line starting with a # and remove the #")
                    print("Error occured in " + FileName)
                else:
                    print("Oops...")
                    print("An error occured, this shouldn't happen")
                    print("Check if files are correct alignment files")
                    print("Error occured around line:")
                    print(Line)
                sys.exit(-1)
        Rows.append(Row)
    return Rows


def MirrorAlignment(Matrix, Highest, MinRows=0):
    """Mirror a matrix of 1-based indices over the diagonal.

    Matrix[y] lists the 1-based rows that column y is aligned to; the result
    lists, for every 0-based row, the columns aligned to it in ascending
    order.  The result has max(Highest, MinRows) rows, so it can always be
    indexed alongside a first-direction matrix of MinRows rows.
    """
    Mirrored = [[] for Unused in range(max(Highest, MinRows))]
    for y, Row in enumerate(Matrix):
        for Number in set(Row):
            if 1 <= Number <= len(Mirrored):
                Mirrored[Number - 1].append(y)
    return Mirrored


def CompareAlignments(Matrix1, Matrix2):
    """Return (intersection, symmetric difference) of two matrices, row by row.

    Matrix2 must have at least as many rows as Matrix1; extra rows are
    ignored.  The symmetric difference lists the points that are only in
    Matrix1 first, then those that are only in Matrix2.
    """
    InterSect = []
    Xor = []
    for Row1, Row2 in zip(Matrix1, Matrix2):
        InterSect.append([y for y in Row1 if y in Row2])
        Xor.append([y for y in Row1 if y not in Row2]
                   + [y for y in Row2 if y not in Row1])
    return InterSect, Xor


def _AlignsNewWord(Matrix, x, y):
    """True if (x, y) is absent and row x or column y is still unaligned."""
    Row = Matrix[x]
    if y in Row:
        return False
    if not Row:
        return True
    for Other in Matrix:
        if y in Other:
            return False
    return True


def _IsAdjacent(Matrix, x, y):
    """True if (x, y) directly neighbours an existing alignment point."""
    Row = Matrix[x]
    if (y + 1) in Row or (y - 1) in Row:
        return True
    # NOTE(review): this bounds check and the start of the next test were
    # stripped from the restored text ('elif (x+10) and ...').
    if x + 1 < len(Matrix) and y in Matrix[x + 1]:
        return True
    if x > 0 and y in Matrix[x - 1]:
        return True
    return False


def GrowAlignment(InterSect, Xor):
    """Extend the intersection with points from Xor and return it.

    First, starting from the bottom right, points that align a so far
    unaligned word and are directly adjacent to an existing point are added
    until nothing changes.  Then all remaining points that connect a
    completely unaligned word are added.  InterSect is modified in place.
    """
    Matrix = InterSect
    Added = True
    while Added:
        Added = False
        for x in range(len(Xor) - 1, -1, -1):
            for y in Xor[x]:
                if _AlignsNewWord(Matrix, x, y) and _IsAdjacent(Matrix, x, y):
                    Matrix[x].append(y)
                    Matrix[x].sort()
                    Added = True
    # now add all non adjacent points connecting completely unaligned words
    for x in range(len(Xor) - 1, -1, -1):
        for y in Xor[x]:
            if _AlignsNewWord(Matrix, x, y):
                Matrix[x].append(y)
                Matrix[x].sort()
    return Matrix


def CountPhrase(PhraseCount, SourcePhrase, TargetPhrase):
    """Add one occurrence of the phrase pair to PhraseCount.

    PhraseCount maps a source phrase to a list of (target phrase, count)
    tuples in order of first occurrence.
    """
    Entries = PhraseCount.setdefault(SourcePhrase, [])
    for Index, (Phrase, Count) in enumerate(Entries):
        if Phrase == TargetPhrase:
            Entries[Index] = (Phrase, Count + 1)
            return
    Entries.append((TargetPhrase, 1))


def CollectPhrases(Alignment, SourceWords, TargetWords, PhraseCount):
    """Read the phrases off Alignment and count them in PhraseCount.

    Rows of Alignment index SourceWords, the values index TargetWords.
    """
    for RowIndex, Row in enumerate(Alignment):
        for Column in Row:
            Candidates = ExpandPhrase(RowIndex, Column, RowIndex, Column,
                                      False, True, Alignment)
            # get rid of doubles; sorted so that the output order is stable
            Candidates = sorted(set(Candidates))
            for (x, n, y, m) in CleanPhrases(Candidates, Alignment):
                SourcePhrase = " ".join(SourceWords[x:n + 1])
                TargetPhrase = " ".join(TargetWords[y:m + 1])
                CountPhrase(PhraseCount, SourcePhrase, TargetPhrase)


def BuildPhraseCounts(fd1, fd2, FileName1, FileName2):
    """Process both alignment files in parallel and return the phrase counts.

    From each file a sentence line and an alignment line are read per
    sentence pair (lines starting with '#' are skipped).  Processing stops
    as soon as either file is exhausted.
    """
    PhraseCount = {}
    while True:
        TargetLine1 = ReadDataLine(fd1)
        if TargetLine1 == "":
            break
        SourceLine1 = ReadDataLine(fd1)
        if SourceLine1 == "":
            break
        TargetLine2 = ReadDataLine(fd2)
        if TargetLine2 == "":
            break
        SourceLine2 = ReadDataLine(fd2)
        if SourceLine2 == "":
            break

        # now build matrices for both alignments (0-based for file 1)
        Parsed1 = ParseAlignment(SourceLine1, FileName1)
        Matrix1 = [[Number - 1 for Number in Row] for Row in Parsed1[1:-1]]
        Parsed2 = ParseAlignment(SourceLine2, FileName2)
        # The maximum deliberately includes the NULL entry, as before.
        Max2 = max([0] + [Number for Row in Parsed2 for Number in Row])
        # we have to mirror Matrix2 over the diagonal; pad it so that a
        # shorter second alignment cannot cause an IndexError below
        Matrix2 = MirrorAlignment(Parsed2[1:-1], Max2, len(Matrix1))

        InterSect, Xor = CompareAlignments(Matrix1, Matrix2)
        Alignment = GrowAlignment(InterSect, Xor)

        TargetWords = TargetLine1.strip().split(" ")
        SourceWords = TargetLine2.strip().split(" ")
        CollectPhrases(Alignment, SourceWords, TargetWords, PhraseCount)
    return PhraseCount


def PrintPhraseTable(PhraseCount):
    """Print 'target ||| source ||| relative frequency' for every pair."""
    for SourcePhrase in PhraseCount:
        Entries = PhraseCount[SourcePhrase]
        Sum = sum([Count for (Phrase, Count) in Entries])
        for (Phrase, Count) in Entries:
            print("%s ||| %s ||| %s" % (Phrase, SourcePhrase, float(Count) / Sum))


def main(argv=None):
    """Command line entry point; argv defaults to sys.argv."""
    if argv is None:
        argv = sys.argv
    WantsHelp = len(argv) > 1 and argv[1] in HelpFlags
    if len(argv) < 2 or WantsHelp:
        PrintBanner()
    # A help flag always ends in the usage text, even when further arguments
    # follow; it is never treated as a file name.
    if len(argv) < 3 or WantsHelp:
        PrintUsage()
        sys.exit(0)

    fd1 = OpenAlignmentFile(argv[1])
    try:
        fd2 = OpenAlignmentFile(argv[2])
        try:
            PhraseCount = BuildPhraseCounts(fd1, fd2, argv[1], argv[2])
        finally:
            fd2.close()
    finally:
        fd1.close()
    PrintPhraseTable(PhraseCount)


if __name__ == "__main__":
    main()