#!/usr/bin/perl
require 5 || exit;
use strict;

#  Sean M. Burke, sburke@cpan.org, 2003
#  Modify the source as you like/want/need.
#    desc{     makes SQL-table-creation code from Unihan.txt    }

@ARGV == 2 or die(
"unihan2sql - makes SQL-table-creation code from Unihan.txt
Usage:
   unihan2sql Unihan.txt Unihan.sql
 or:
   unihan2sql Unihan.txt -
      To send the SQL code to STDOUT
");

my($In, $Out) = @ARGV;
die "What input?" unless $In and -f $In and -r _ and -s _;

my $Table_name = 'unihan';
my $Charnum_field_name = 'handec';  # han number, as decimal
my $Record_count = 0;

my @Unihan_keys;
my %Native2sqlname; # mostly just take out the _'s
my %Native2sqltype;

$Native2sqlname{'Definition'} = 'def';
# "definition" is apparently a reserved word in SQL


scan_for_field_names();
open_output();
create_table();
data2sql();
byebye();
exit;

#--------------------------------------------------------------------
sub byebye {
  print join "\n",
    '', 
    "unlock tables;",
    "/* $Record_count records */",
    '',
  ;
  print STDERR "$0: Done after ", time() - $^T,
    " seconds for $Record_count records.\n",
  ;
}

#--------------------------------------------------------------------------

sub scan_for_field_names {
  # Build @Unihan_keys and %Native2sqlname and %Native2sqltype
  
  open_input();
  
  my(%max_length, %is_non_numeric, %order_rank_hint);
  
  print STDERR "$0: Scanning $In...\n";
  
  my($k,$v);
  while(<IN>) {
    if( ($k,$v) =  # Normal case: a non-comment non-blank line
      m/^
          U\+[a-fA-F0-9]{4,6}  # character number
          \t
          k(\S+)        # key name
          \t
          ([^\cm\cj\t]+) # data value
       /sx
    ) {
      if( $is_non_numeric{$k} ) {
        $max_length{$k} = length($v) if length($v) > ($max_length{$k} || 0);
      } else {
        if( $v =~ m/^\d{1,8}$/s ) {
          $is_non_numeric{$k} = '';
        } else {
          #print "$k $v is non-numeric in $_";
          $is_non_numeric{$k} = 1;
          $max_length{$k} = length($v) if length($v) > ($max_length{$k} || 0);
        }
      }

    } elsif( m/^\s*#/ ) { # comment lines
      $order_rank_hint{$1} ||= $. if  m/^#\tk([a-zA-Z0-9_]+)/s;
       # This is a bit of a hack to capture the "canonical" field
       # Ordering from the initial comments in the file

    } elsif(!m/\S/) {
      # do nothing for blank lines

    } else { # an aberrant line
      chomp;
      warn "$0: What kind of line is \xAB$_\xBB?\n in $In line $.\n";
    }
  }
  close(IN);
  
  @Unihan_keys = keys %{{ %is_non_numeric, %max_length }};
  my $sqlname;
  
  foreach my $k (@Unihan_keys) {
    $Native2sqltype{$k} ||=
        (!$is_non_numeric{$k}) ? 'int'
      : !$max_length{$k} ? "varchar(255)" # shouldn't happen
      : ($max_length{$k} > 255) ? "long varchar"
      : "varchar($max_length{$k})"
    ;
    $order_rank_hint{$k} ||= 99999;
    $sqlname = lc($k);
    $sqlname =~ tr/a-z0-9//cd; # leave only alphanumeric characters
    $sqlname =~ s/^(\d)/x$1/s; # just in case that leaves us with
                 # a field named like "123", make it "x123".
    $Native2sqlname{$k} ||= $sqlname;
  }

  # Now leave Unihan_keys smartly sorted:
  @Unihan_keys = sort {
      $order_rank_hint{$a} <=> $order_rank_hint{$b}
      or lc($a) cmp lc($b)
  } @Unihan_keys;

  #print STDERR "$0: Keys: @Unihan_keys\n=>\n@{[map $Native2sqlname{$_}, @Unihan_keys]}\n\n";

  reopen_input();
}

#--------------------------------------------------------------------

sub data2sql {
  print STDERR "$0: Generating SQL...\n";

  my(@keys, @values); # for this character
  my($charnum, $k, $v);  # for this line
  my $lastcharnum = '';
 
  while(<IN>) {
    next unless
      ( $charnum, $k, $v) =
      m/^
        U\+([a-fA-F0-9]{4,6})  # character number
        \t
        k(\S+)        # key name
        \t
        ([^\cm\cj\t]+) # data value
       /sx
    ;
    emit(\@keys, \@values, $lastcharnum) unless $charnum eq $lastcharnum;
    $lastcharnum = $charnum;
    push @keys, $k;
    push @values, $v;
  }
  emit(\@keys, \@values, $charnum);
  close(IN);
}

sub emit {
  my($klist, $vlist, $charnumhex) = @_;
  return unless length($charnumhex) and @$klist and @$vlist;

  foreach my $v (@$vlist) { $v =~ s/"/\\"/g } # turn quotes into escape-quotes
  ++$Record_count;

  print join '',
    "insert into $Table_name (",
    join(q{, },
      $Charnum_field_name,
      map $Native2sqlname{$_}, splice @$klist
    ),
    ")\n\tvalues (",
    join( q{, },  map qq{"$_"},   hex($charnumhex), splice @$vlist),
    ");\n",
  ;
  return;
  
}

#--------------------------------------------------------------------

sub create_table {
  my $table_name = 'unihan';

  print join "\n",
    "/*",
    "\t SQL generated by unihan2sql by sburke\@cpan.org",
    sprintf("\t on %s \n\t from %s (%s b long, mod %s)",
      scalar(localtime),
      $In,
      -s $In,
      scalar(localtime( (stat($In))[9] ))
    ),
    " */",
    '',
    "drop table if exists $table_name;",
    "create table $table_name (", 
    join(",\n",
      "\t $Charnum_field_name int(10) unsigned not null primary key",
      map sprintf( "\t %s %s",
          $Native2sqlname{$_},
          $Native2sqltype{$_} || 'varchar(255)',
        ),
        @Unihan_keys,
    ),
    ");",
    "lock tables $table_name write;",
    '',
    '',
  ;
}

#--------------------------------------------------------------------

sub open_output {
  if($Out eq '-') {
    # OK, fine.
  } else {
    open OUT, ">$Out" or die "Can't write-open $Out: $!";
    select(OUT);
  }
  return;
}

#--------------------------------------------------------------------

sub reopen_input {
  open(IN, $In) or die "Can't read-open $In: $!";
  binmode(IN);
  return;
}

#--------------------------------------------------------------------

sub open_input {  # open to IN and adjust newline format
  my $x;
  reopen_input($In);
  read(IN, $_, 2000) or die "Can't read from $In: $!";
  close(IN);
  $/ =
    m/((?:\cm\cj+)|\cm|\cj)/s
     ? $1   # yay, nice newline format
     : die "What newline format is $In in?!"  # should never happen
  ;
  
  reopen_input();
}

#--------------------------------------------------------------------

__END__

