/*
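 * Build the transition matrix for segments (bunsetsu).
 *
 * This command has two functions (selected with the -c option):
 * (1) produce a text-format table of empirical probabilities from
 *     the output of proccorpus
 * (2) convert a text-format table into binary format
 *
 * The morphological-analyzer output carries the following marks:
 * ~ wrong candidate
 * ! wrong segment length
 * ^ second or later element of a compound segment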
 *
 * generate transition matrix
 *
 * Copyright (C) 2006 HANAOKA Toshiyuki
 * Copyright (C) 2006-2007 TABATA Yusuke
 *
 */
/*
  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>

#include <anthy/feature_set.h>
#include <anthy/diclib.h>
#include "input_set.h"

/* Minimum number of feature columns dump_line() writes per line;
 * shorter feature lists are padded with 0 up to this width. */
#define FEATURE_SET_SIZE NR_EM_FEATURES

/* Statistics accumulated while reading the corpus files */
struct matrix {
  /* Fed by add_seg_struct_info(); dumped as section anthy.cand_info */
  struct input_set *cand_is;
  /* Fed directly by parse_indep(); dumped as section anthy.trans_info */
  struct input_set *seg_is;
  /* Fed by set_features_with_context() with hash pairs (plus an
   * optional context hash); dumped, after input_set_filter(), as
   * section anthy.reorder_info */
  struct input_set *indep_is;

  /* Counters reported on stderr by proc_corpus() */
  int nr_sentences;
  int nr_connections;
};

/* Capacity of the feature vector stored in struct array */
#define ARRAY_SIZE 16

/* Fixed-capacity list of integer features */
struct array {
  /* number of entries of f[] that are in use */
  int len;
  int f[ARRAY_SIZE];
};

/* Number of segment slots available for one sentence */
#define MAX_SEGMENT 64

/* Hashes recorded for one segment.  set_hash() stores the hash of a
 * line tagged '~' in orig_hash and every other hash in hash;
 * init_sentence_info() resets both to 0, which the rest of the file
 * treats as "not set". */
struct segment_info {
  int orig_hash;
  int hash;
};

/* Segments collected for the sentence currently being read */
struct sentence_info {
  /* number of entries of segs[] completed so far */
  int nr_segments;
  struct segment_info segs[MAX_SEGMENT];
};

/*
 * Allocate a matrix holding three freshly created input sets and
 * zeroed counters.
 *
 * The program is aborted when the allocation fails, so callers never
 * receive NULL.
 */
static struct matrix *
init_matrix(void)
{
  struct matrix *m;
  m = malloc(sizeof(*m));
  if (!m) {
    fprintf(stderr, "failed to allocate matrix\n");
    abort();
  }
  m->seg_is = input_set_create();
  m->cand_is = input_set_create();
  m->indep_is = input_set_create();
  m->nr_sentences = 0;
  m->nr_connections = 0;
  return m;
}

static void
parse_features(struct array *features, char *s)
{
  char *tok, *str = s;
  tok = strtok(str, ",");
  features->len = 0;
  do {
    features->f[features->len] = atoi(tok);
    features->len++;
    tok = strtok(NULL, ",");
  } while(tok);
}

/* Register the feature vector in the candidate input set (m->cand_is)
 * with the given weight. */
static void
add_seg_struct_info(struct matrix *m,
		    struct array *features,
		    double weight)
{
  struct input_set *target = m->cand_is;
  int count = features->len;

  input_set_set_features(target, features->f, count, weight);
}

static void
set_hash(struct sentence_info *sinfo, int error_class,
	 char tag, int hash)
{
  if (tag == '~') {
    sinfo->segs[sinfo->nr_segments].orig_hash = hash;
  } else {
    sinfo->segs[sinfo->nr_segments].hash = hash;
  }
  if (!error_class) {
    sinfo->nr_segments++;
  }
}

/*
 * Process one "indep_word" or "eos" line.
 *
 * line is the whole input line (line[0] may be one of the marks
 * '~', '!' or '^'), buf is the same text with the mark skipped and
 * error_class is non-zero when a mark was present.
 *
 * The "hash=" field, if any, is recorded in sinfo.  The "features="
 * field, if any, is counted as a connection and registered:
 *  - unmarked line: +1 in both seg_is and cand_is
 *  - '~' line:      -1 in cand_is
 *  - '!' line:      -1 in seg_is
 *  - '^' line:      nothing is registered
 * A line without "features=" registers nothing; the feature vector
 * would otherwise be read uninitialized.
 */
static void
parse_indep(struct matrix *m, struct sentence_info *sinfo,
	    char *line, char *buf, int error_class)
{
  struct array features;
  char *feat, *hash;
  double weight = 1.0;

  /* Locate both fields first: parse_features() cuts buf at the first
   * comma, which would hide a "hash=" field placed after the list. */
  feat = strstr(buf, "features=");
  hash = strstr(buf, "hash=");

  if (hash) {
    set_hash(sinfo, error_class, line[0], atoi(hash + 5));
  }
  if (!feat) {
    return ;
  }
  features.len = 0;
  parse_features(&features, feat + 9);
  m->nr_connections ++;

  if (error_class) {
    if (line[0] == '~') {
      /* wrong candidate: negative example for the candidate set */
      add_seg_struct_info(m, &features, -weight);
    }
    if (line[0] == '!') {
      /* wrong segment length: negative example for the segment set */
      input_set_set_features(m->seg_is, features.f, features.len, -weight);
    }
  } else {
    /* positive example for both sets */
    input_set_set_features(m->seg_is, features.f, features.len, weight);
    add_seg_struct_info(m, &features, weight);
  }
}

static void
init_sentence_info(struct sentence_info *sinfo)
{
  int i;
  sinfo->nr_segments = 0;
  for (i = 0; i < MAX_SEGMENT; i++) {
    sinfo->segs[i].orig_hash = 0;
    sinfo->segs[i].hash = 0;
  }
}

/* Zero all ARRAY_SIZE entries of a->f.  a->len is left untouched. */
static void
clear_array(struct array *a)
{
  int *slot = a->f;
  int remaining = ARRAY_SIZE;

  while (remaining-- > 0) {
    *slot++ = 0;
  }
}

static void
set_features_with_context(struct matrix *m, struct array *a,
			  struct segment_info *ctx, int weight)
{
  struct array arr;
  arr = *a;
  if (ctx) {
    arr.len = 3;
    if (ctx->orig_hash) {
      arr.f[2] = ctx->orig_hash;
    } else {
      return ;
    }
  }
  input_set_set_features(m->indep_is, arr.f, arr.len, weight);
}

static void
add_candidate_context_info(struct matrix *m, struct segment_info *seg,
			   struct segment_info *ctx)
{
  struct array a;
  clear_array(&a);
  a.len = 2;
  /* orig_hashϸѴθ⤷0
   * hashϳꤵ줿
   * [0] 0֤θ
   * [1] ꤵ줿
   */
  if (!seg->hash) {
    return ;
  }
  if (seg->orig_hash == 0) {
    /* 0ܤθ䤬ꤵ줿 */
    a.f[0] = seg->hash;
    a.f[1] = 0;
    set_features_with_context(m, &a, ctx, 1);
    return ;
  }
  /* 0ܰʳθ䤬ꤵ줿 */
  if (seg->orig_hash == seg->hash) {
    /* ʤɤ㤦ѥ󡢼ΩƱ */
    return ;
  }
  a.f[0] = seg->orig_hash;
  a.f[1] = seg->hash;
  set_features_with_context(m, &a, ctx, 1);
  a.f[0] = seg->orig_hash;
  a.f[1] = 0;
  set_features_with_context(m, &a, ctx, -1);
}

/*
 * Register every segment of a finished sentence: once without
 * context and once for each neighbour at distance 1 or 2 on either
 * side, in ascending order of the neighbour's index.
 */
static void
complete_sentence_info(struct matrix *m, struct sentence_info *sinfo)
{
  int total = sinfo->nr_segments;
  int i;

  for (i = 0; i < total; i++) {
    /* window [i - 2, i + 2] clipped to the valid index range */
    int first = (i < 2) ? 0 : i - 2;
    int limit = (i + 3 < total) ? i + 3 : total;
    int j;

    for (j = first; j < limit; j++) {
      struct segment_info *ctx = (j == i) ? NULL : &sinfo->segs[j];
      add_candidate_context_info(m, &sinfo->segs[i], ctx);
    }
  }
}

/*
 * Read morphological-analyzer output from fp line by line and
 * accumulate it in m.
 *
 * An unmarked line starting with "eos" ends the current sentence: the
 * sentence counter is advanced, the collected segments are registered
 * and the sentence state is reset.  Lines whose text (after an
 * optional '~', '!' or '^' mark) starts with "indep_word" or "eos"
 * are handed to parse_indep(); all other lines are ignored.
 */
static void
read_morph_file(struct matrix *m, FILE *fp)
{
  struct sentence_info sinfo;
  char line[1024];

  init_sentence_info(&sinfo);

  while (fgets(line, sizeof(line), fp)) {
    char *body = line;
    int error_class = 0;

    if (strncmp(line, "eos", 3) == 0) {
      m->nr_sentences ++;
      complete_sentence_info(m, &sinfo);
      init_sentence_info(&sinfo);
    }

    switch (line[0]) {
    case '~':
    case '!':
    case '^':
      /* skip the mark and remember that the line is an error case */
      body = line + 1;
      error_class = 1;
      break;
    default:
      break;
    }

    if (strncmp(body, "indep_word", 10) != 0 &&
	strncmp(body, "eos", 3) != 0) {
      continue;
    }
    parse_indep(m, &sinfo, line, body, error_class);
  }
}

/*
 * Open the corpus file fn and accumulate its contents in m.
 *
 * A file that cannot be opened is reported on stderr (with the same
 * message convert_data() uses) and skipped; processing continues with
 * the remaining files.
 */
static void
read_file(struct matrix *m, char *fn)
{
  FILE *ifp;
  ifp = fopen(fn, "r");
  if (!ifp) {
    fprintf(stderr, "failed to open (%s)\n", fn);
    return ;
  }
  read_morph_file(m, ifp);
  fclose(ifp);
}

/* Write i to fp as one int-sized binary word after passing it through
 * anthy_dic_htonl() (presumably host-to-network byte order; confirm
 * in anthy/diclib.h).  The result of fwrite() is not checked. */
static void
write_nl(FILE *fp, int i)
{
  int word = anthy_dic_htonl(i);

  fwrite(&word, sizeof(word), 1, fp);
}

/*
 * Write one input line to ofp as text.
 *
 * The features are written first, separated by ", " and padded with
 * 0 up to FEATURE_SET_SIZE columns, followed by ",NEG,POS" where NEG
 * and POS are il->negative_weight and il->weight truncated to int.
 */
static void
dump_line(FILE *ofp, struct input_line *il)
{
  int col;

  /* the features actually present */
  for (col = 0; col < il->nr_features; col++) {
    if (col) {
      fputs(", ", ofp);
    }
    fprintf(ofp, "%d", il->features[col]);
  }
  /* zero padding; col continues where the first loop stopped */
  for (; col < FEATURE_SET_SIZE; col++) {
    if (col) {
      fputs(", ", ofp);
    }
    fputs("0", ofp);
  }
  fprintf(ofp,",%d,%d\n", (int)il->negative_weight, (int)il->weight);
}

/*
 * qsort() comparator for an array of struct input_line pointers.
 *
 * Lines are ordered by their first differing feature; if one feature
 * list is a prefix of the other, the shorter one sorts first.
 *
 * NOTE(review): the result is computed by subtraction, which can
 * overflow for features that are far apart.  Any reader that searches
 * the sorted output presumably has to use the same ordering, so
 * confirm against that code before changing it.
 */
static int
compare_line(const void *p1, const void *p2)
{
  const struct input_line *lhs = *(const struct input_line *const *)p1;
  const struct input_line *rhs = *(const struct input_line *const *)p2;
  int i = 0;

  while (i < lhs->nr_features && i < rhs->nr_features) {
    if (lhs->features[i] != rhs->features[i]) {
      return lhs->features[i] - rhs->features[i];
    }
    i++;
  }
  return lhs->nr_features - rhs->nr_features;
}

/*
 * Write all lines of is to ofp in sorted order.
 *
 * The output starts with "W N total_line_weight,count" where W is the
 * sum of the line weights (each truncated to int) and N the number of
 * lines, followed by one dump_line() row per line ordered by
 * compare_line().
 *
 * The temporary pointer table is released before returning.  The
 * program is aborted if the table cannot be allocated.
 */
static void
dump_features(FILE *ofp, struct input_set *is)
{
  struct input_line *il, **lines = NULL;
  int i, nr = 0;
  int weight = 0;

  /* count lines */
  for (il = input_set_get_input_line(is); il; il = il->next_line) {
    nr ++;
    weight += (int)il->weight;
  }
  /* an empty set needs no table; this also avoids malloc(0) */
  if (nr > 0) {
    lines = malloc(sizeof(*lines) * nr);
    if (!lines) {
      fprintf(stderr, "failed to allocate line table\n");
      abort();
    }
    /* copy lines */
    for (il = input_set_get_input_line(is), i = 0; i < nr;
	 i++, il = il->next_line) {
      lines[i] = il;
    }
    /* sort */
    qsort(lines, nr, sizeof(*lines), compare_line);
  }
  /* output */
  fprintf(ofp, "%d %d total_line_weight,count\n", weight, nr);
  for (i = 0; i < nr; i++) {
    dump_line(ofp, lines[i]);
  }
  free(lines);
}

/*
 * Write the three tables of m to ofp in text form.
 *
 * Each table is introduced by "section NAME " and completed by the
 * header and rows that dump_features() emits.  The reorder table is
 * passed through input_set_filter(..., 1, 1) before being dumped.
 */
static void
dump_format_t(FILE *ofp, struct matrix *m)
{
  fputs("section anthy.trans_info ", ofp);
  dump_features(ofp, m->seg_is);

  fputs("section anthy.cand_info ", ofp);
  dump_features(ofp, m->cand_is);

  fputs("section anthy.reorder_info ", ofp);
  dump_features(ofp, input_set_filter(m->indep_is, 1, 1));
}

/*
 * Convert one text row into binary: every comma separated field of
 * buf is parsed with atoi() and written to ofp with write_nl().
 *
 * buf is modified in place by strtok().  A row that contains no field
 * at all (strtok() returns NULL) writes nothing.
 */
static void
convert_line(FILE *ofp, char *buf)
{
  char *tok;
  for (tok = strtok(buf, ","); tok; tok = strtok(NULL, ",")) {
    write_nl(ofp, atoi(tok));
  }
}

/*
 * Convert a text table file into binary files.
 *
 * Every line of the form "section NAME W N ..." closes the previous
 * output and starts a new binary file called NAME.  The file begins
 * with W and N followed by NR_EM_FEATURES zero words; each following
 * text line is appended through convert_line().
 *
 * The program is aborted when a section line cannot be parsed or its
 * output file cannot be opened.  Data lines that appear before the
 * first section line have no output file and are skipped.
 */
static void
convert_file(FILE *ifp)
{
  char buf[1024];
  FILE *ofp = NULL;
  while (fgets(buf, sizeof(buf), ifp)) {
    if (!strncmp("section", buf, 7)) {
      int w, n, i;
      char fn[1024];
      if (ofp) {
	fclose(ofp);
	ofp = NULL;
      }
      /* all three fields are required; the width keeps fn in bounds */
      if (sscanf(buf, "section %1023s %d %d", fn, &w, &n) != 3) {
	fprintf(stderr, "malformed section line: %s\n", buf);
	abort();
      }
      /* the output is binary data, so open it in binary mode */
      ofp = fopen(fn, "wb");
      if (!ofp) {
	fprintf(stderr, "failed to open (%s)\n", fn);
	abort();
      }
      write_nl(ofp, w);
      write_nl(ofp, n);
      for (i = 0; i < NR_EM_FEATURES; i++) {
	write_nl(ofp, 0);
      }
    } else if (ofp) {
      convert_line(ofp, buf);
    }
  }
  if (ofp) {
    fclose(ofp);
  }
}

/*
 * Convert each of the nr_fn text table files named in fns into binary
 * form with convert_file().
 *
 * A file that cannot be opened is reported on stderr and skipped.
 * Every opened file is closed again after conversion.
 */
static void
convert_data(int nr_fn, char **fns)
{
  FILE *ifp;
  int i;
  for (i = 0; i < nr_fn; i++) {
    ifp = fopen(fns[i], "r");
    if (!ifp) {
      fprintf(stderr, "failed to open (%s)\n", fns[i]);
      continue;
    }
    convert_file(ifp);
    fclose(ifp);
  }
}

/*
 * Read the nr_fn corpus files named in fns, write the accumulated
 * tables to ofp in text form and print a short summary on stderr.
 *
 * The segment count in the summary is the number of connections minus
 * the number of sentences.
 */
static void
proc_corpus(int nr_fn, char **fns, FILE *ofp)
{
  struct matrix *m = init_matrix();
  int idx = 0;
  int segments;

  /* accumulate statistics from every input file */
  while (idx < nr_fn) {
    read_file(m, fns[idx]);
    idx++;
  }

  /* text output */
  dump_format_t(ofp, m);

  /* summary */
  segments = m->nr_connections - m->nr_sentences;
  fprintf(stderr, " %d sentences\n", m->nr_sentences);
  fprintf(stderr, " %d connections\n", m->nr_connections);
  fprintf(stderr, " %d segments\n", segments);
}

/*
 * Command line entry point.
 *
 *   -o FILE  read the remaining arguments as corpus files and write
 *            the text table to FILE
 *   -c       convert the remaining arguments (text tables) to binary
 *   other    input file name
 *
 * Returns 1 when "-o" has no file name or memory cannot be allocated,
 * 0 otherwise.  As before, a FILE that cannot be opened is reported
 * and the corpus step is skipped without changing the exit status.
 */
int
main(int argc, char **argv)
{
  FILE *ofp;
  int i;
  int nr_input = 0;
  char **input_files;
  int convert = 0;

  ofp = NULL;
  /* one spare slot keeps the request non-zero even for argc == 0 */
  input_files = malloc(sizeof(*input_files) * ((size_t)argc + 1));
  if (!input_files) {
    fprintf(stderr, "failed to allocate the input file list\n");
    return 1;
  }

  for (i = 1; i < argc; i++) {
    char *arg = argv[i];
    if (!strcmp(arg, "-o")) {
      if (i + 1 >= argc) {
	/* argv[argc] is NULL and must not reach fopen() */
	fprintf(stderr, "-o requires a file name\n");
	if (ofp) {
	  fclose(ofp);
	}
	free(input_files);
	return 1;
      }
      if (ofp) {
	/* repeated -o: the last one wins, do not leak the earlier one */
	fclose(ofp);
      }
      ofp = fopen(argv[i+1], "w");
      if (!ofp) {
	fprintf(stderr, "failed to open (%s)\n", argv[i+1]);
      }
      i ++;
    } else if (!strcmp(arg, "-c")) {
      convert = 1;
    } else {
      input_files[nr_input] = arg;
      nr_input ++;
    }
  }
  if (ofp) {
    proc_corpus(nr_input, input_files, ofp);
    fclose(ofp);
  }
  if (convert) {
    convert_data(nr_input, input_files);
  }

  free(input_files);
  return 0;
}
