#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

const int max_sentence_length = 255; // Counted in words
const int max_references=255; //Max amount of reference translations
char *ref_tokens[max_references][max_sentence_length]; //tokens of the reference file
int ref_tokens_count[max_references];
char *tokens[max_sentence_length]; //stores the tokens of the candidate sentence
int pn_numerator[max_references]; // Precision numerator part of the Bleu metric
int pn_denominator[max_references]; // Precision denominator part of the Bleu metric

// options
int verbose=0;
int casesens=0; // Case Sensitivity
int i_sent_nr=0; // ignore sentence numbering... Consider the first token to be a sentence number and ingore this.

int n=1; //max size of the n-gram

int min(int x1,int x2) {
  if (x1<x2) return x1;
  return x2;
}

int max(int x1,int x2) {
  if (x1<x2) return x2;
  return x1;
}

int read_ref_tokens(FILE *fd,int rf) {
// PRE:  fd points to an open file where a reference translation sentence starts
// POST: ref_tokens[rf][] is filled with tokens
//       returns the token count of the sentence
  char *line=(char *)malloc(1024);
  char *linep=line;
  char *token;
  int t=0;
  int firsttoken=1;
   
  while (!(feof(fd)) && (*linep!='*')) {
    fgets(line,1024,fd);
    linep=line;
    while ((*linep!='\0') && (*linep!='*')) {
      while ((*linep!='\0') && !((*linep >='0' && *linep<='9') || (*linep >= 'a' && *linep <= 'z') || (*linep >= 'A' && *linep<='Z') || *linep=='\'')) linep++;
      token=linep;
      while ((*linep!='\0') && ((*linep >='0' && *linep<='9') || (*linep >= 'a' && *linep <= 'z') || (*linep >= 'A' && *linep<='Z') || *linep=='\'')) linep++;
      if (*linep!='\0') {
        if (ref_tokens[rf][t]==0) ref_tokens[rf][t]=(char *)malloc(1024);
        *linep='\0';
        if ((firsttoken==1) && (i_sent_nr==1)) firsttoken=0;
        else {
          strcpy(ref_tokens[rf][t],token);
          t++;
        }
        linep++;
      }
    }
  }
  ref_tokens_count[rf]=t;
  return t;  
}

int ngramcount(char *pngram[max_sentence_length],int nsize,char *sentence[max_sentence_length], int ssize) {
// PRE: pngram[0]..pngram[nsize] are all tokens forming a ngram
//      sentence[0]..sentence[ssize] are all tokens forming a sentence
// RETURN: the amount of pngrams in sentence othwise

  if (nsize > ssize) return 0;
  int i;
  int j;
  int count=0;
  int ncount=0;
  for (i=0;i<(ssize-nsize+1);i++) {
// all n-grams in sentence
    ncount=0;
    for (j=0;j<nsize;j++) {
      if (casesens==0) {
        if (strcasecmp(pngram[j],sentence[i+j])==0) ncount++;
      } else
        if (strcmp(pngram[j],sentence[i+j])==0) ncount++;
      if (ncount==nsize) count++;  //Make sure we have seen the entire n-gram not only a part
    }
  }
  return count;
}

int analyze(int refs,int t) {
// PRE: ref_tokens[x][] is filled for every x<refs
//      t indicates how many candidate tokens are available
//      for every ref_tokens[x][] ref_token_count[x] indicates how may reference tokes are available
// POST: pn_numerator and pn_denominator are updated

  int cn; // current n-gram size
  int i;
  int j;
  int CountClip;

  // for all n-gram sizes:
  for (cn=1;cn<=n;cn++) {
    
    CountClip=0;
    // for all candidate n-grams
    for(i=0;i<(t+1-cn);i++) {
      if (ngramcount(&tokens[i],cn,&tokens[i+1],t-i-1) == 0) { // only if n-gram is uniek in the remainder to prevent double counting
        if (verbose) {
          printf("For N-Gram (");
          for (j=0;j<cn;j++) printf("%s ",tokens[i+j]);
          printf("): ");
        }
        int MaxRefCount=0;
        for (j=0;j<refs;j++) {
          // Take the maximum of all the references
          MaxRefCount=max(MaxRefCount,ngramcount(&tokens[i],cn,ref_tokens[j],ref_tokens_count[j]));
        }
        if (verbose==1) {
          int v=min(MaxRefCount,ngramcount(&tokens[i],cn,tokens,t));
          CountClip+=v;
          printf("%i\n",v);
        } else CountClip+=min(MaxRefCount,ngramcount(&tokens[i],cn,tokens,t));
      }
    }
    if (t+1-cn > 0) {
      pn_numerator[cn-1]+=CountClip;
      pn_denominator[cn-1]+=(t+1-cn);
    }

  }
  return 0;
}

int main(int argc, char *argv[])
{
  int i,j; // loop variables;
  int r,c; // Total length of References and Candidates in tokens;
  r=c=0;
  for (i=0;i<max_references;i++) for (j=0;j<max_sentence_length;j++) {
    ref_tokens[i][j]=0;
  }
  char *cfilename=0; //filename of candidate or to check translations
  char *rfilename[max_references];
  int ref=0; //the number of references

// Processing commandline arguments
  for (i=1;i<argc;i++) {
    
    if (strcmp(argv[i],"-h")==0) {
      fprintf(stdout,"Bleu an evaluation tool for Machine Translation\nWritten by S.Zwarts\n");
      fprintf(stdout,"Based on the paper:\n  Bleu: a Method for Automatic Evaluation of Machine Translation\n  by Papineni K., Roukos S., Ward T., Zhu WJ.\n\n");
      fprintf(stdout,"Usage: Bleu [-v] [-i] [-c] [-n number] candidatefile referencefile1 [referencefile2 ...]\nUse Bleu -h for help\n\n");
      fprintf(stdout,"  -h  Print this message and quit\n");
      fprintf(stdout,"  -v  Turn on verbose mode\n");
      fprintf(stdout,"  -c  Turn on case sensitivity\n");
      fprintf(stdout,"  -n  Must be followed by a positive nonzero integer\n");
      fprintf(stdout,"      Set the maximum n-gram size. Default is 1\n");
      fprintf(stdout,"  -i  Ignore first token in every sentence\n");
      fprintf(stdout,"      For sentence numbering\n");

      return 0;
    }
  
    if (strcmp(argv[i],"-c")==0) {
      casesens=1;
    } else

    if (strcmp(argv[i],"-i")==0) {
      // Assuming Sentences are numbered, ignore first token of every sentence
      i_sent_nr=1;
    } else

// Setting the size of the n-gram
    if (strcmp(argv[i],"-n")==0) {
      if (++i >= argc) {
        fprintf(stderr,"Missing argument for -n\n  Usage -n <integer> - defines the size for n-gram\n");
        return -1;
      } 
      n=atoi(argv[i]);
      if (n <= 0) {
        fprintf(stderr,"%s is an invalide argument for -n\n  Usage -n <integer> - defines the size for n-gram\n",argv[i]);
        return -1;
      }
    } else 

// Turning on verbose mode
    if (strcmp(argv[i],"-v")==0) {
      verbose=1;
    } else {

// Setting Candidate filename
      if (cfilename==0) {
        cfilename=argv[i];
      } else {
        rfilename[ref++]=argv[i];
      }
    }
  }
  if (cfilename==0) {
    fprintf(stderr,"Missing file with Candidate Translations\nUsage: Bleu [-v] [-i] [-c] [-n number] candidatefile referencefile1 [referencefile2 ...]\nUse Bleu -h for help\n");
    return -1;
  }
  if (verbose) {
    printf("size of n-gram: %i\n",n);
    printf("file with candidate translations: %s\n",cfilename);
    printf("consider first token sentence number: %i\n",i_sent_nr);
    printf("#reference files: %i\n",ref);
  }
  FILE *fd=fopen(cfilename,"r");
  FILE *fd_ref[max_references];
  if (fd==NULL) {
    fprintf(stderr,"Error reading %s\n",cfilename);
    return -1;
  }
// Open reference files
  for(i=0;i<max_references;i++) {
    if (i>=ref) fd_ref[i]=NULL;
    else {
      fd_ref[i]=fopen(rfilename[i],"r");
      if (fd_ref[i]==NULL)
        fprintf(stderr,"Error reading %s\n",rfilename[i]);
    }
  }
  // Tokenize the candidate file
  char *line=(char *)malloc(1024);
  char *linep;
  char *token;
  int t=0;
  for (i=0;i<max_sentence_length;i++) tokens[i]=0;

  for (i=0;i<n;i++) {
    // initialize the precision fraction
    pn_numerator[i]=0;
    pn_denominator[i]=0;
  }
  
  int firsttoken=1;
  while (!(feof(fd))) {
    fgets(line,1024,fd);
    linep=line;
    while (*linep!='\0') {
      if (*linep=='*') {
        firsttoken=1;
        while (*linep=='*') linep++;
        for (i=0;i<ref;i++) {
           read_ref_tokens(fd_ref[i],i);
        }
        // Ok now we have the tokens of the source sentence in tokens[] and the tokens of the reference sentence in ref_tokens[][]
        // calculate blue value for this sentence
        analyze(ref,t); 
        c+=t;
        int shortestref=ref_tokens_count[0];
        for (i=1;i<ref;i++) shortestref=min(shortestref,ref_tokens_count[i]);
        r+=shortestref;
        if (verbose==1) printf("===== Next Sentence =====\n");
        t=0;
      } else {
        while ((*linep!='\0') && !((*linep >='0' && *linep<='9') || (*linep >= 'a' && *linep <= 'z') || (*linep >= 'A' && *linep<='Z') || *linep=='\'')) linep++;
        token=linep;
        while ((*linep!='\0') && ((*linep >='0' && *linep<='9') || (*linep >= 'a' && *linep <= 'z') || (*linep >= 'A' && *linep<='Z') || *linep=='\'')) linep++;
        if (*linep!='\0') {
          if (tokens[t]==0) tokens[t]=(char *)malloc(1024);
          *linep='\0';
          if ((firsttoken==1) && (i_sent_nr==1)) firsttoken=0;
          else {
            strcpy(tokens[t],token);
            t++;
          }
          linep++;
        }
      }
    }
  }
  for(i=0;(tokens[i]!=0) && (i < max_sentence_length);free(tokens[i++]));
  for(i=0;i<max_references;i++) if (fd_ref[i]!=NULL) fclose(fd_ref[i]);
  free(line);
  fclose(fd);

  double precision=0;
  if (pn_denominator[n-1] > 0) {
    // calculate precision using uniform weighting;
    int Smoother=0; // Use smoothing technique "Add One" if one n-gram count is zero 
    for(i=0;i<n;i++) if (pn_numerator[i]==0) Smoother=1;
    for(i=0;i<n;i++) {
      precision+=log(((double)pn_numerator[i]+(double)Smoother)/((double)pn_denominator[i]+(double)Smoother)); 
      printf("Precision %i-gram: %f",i+1,(double)pn_numerator[i]/(double)pn_denominator[i]);
      if (Smoother==1) printf(" but used %f because of smoothing\n",((double)pn_numerator[i]+(double)Smoother)/(double)pn_denominator[i]);
      else printf("\n");
    }
    precision=exp(precision/(double)n);
    printf("Weighted Precision: %f\n",precision);
    // Calculate brevity penalty
    double BP=1.0;
    if (c<r) BP=exp((double)1-((double)r/(double)c));
    printf("Brevity Penalty: %f\n",BP);
    // Now calculate the precision in the log domain
    if (Smoother==1) printf("Used \"Add one\" smoothing\n");
    printf("-------------------------\nBleu = %f\n",BP*precision);
  } else {
    fprintf(stderr,"sentences too short to have %i-grams\n",n);
    return -1;
  }
  return 0;
}
