
# Makes an HTML table of all the combining characters in the current Unicode table "DerivedName.txt"
# sburke@pobox.com  2024-03-13

use strict;
use warnings;


# Configuration. Both filespecs are relative, so they are resolved against
# whatever directory the script is run from.

# The Unicode names table to read.
our $Data_Input_Filespec   = "DerivedName.txt";
# The HTML file to write; an existing file of this name is overwritten.
our $HTML_Output_Filespec  = "combining_characters_table.html";
# True: drop the words COMBINING and ACCENT from the name shown in the
# "Name" column. The full name is still put in each row's hover text.
our $Shorten_The_Names = 1; #   0 to not shorten them at all
         # Note: Full Names are on the hovers

#--------------------------------------------------------------------------

# Shared state, all of it set up by Main():
# the three pieces of the HTML template, as returned by Get_Parts()
our( $Header, $Row, $Footer );
our($IN_FH, $OUT_FH); # filehandles
# Number of table rows written so far; Main_Loop() increments it.
our $Matched_Count = 0;
Main();
exit;

#--------------------------------------------------------------------------

# Main()
#
# Top-level driver. Checks that the input file is there and readable, loads
# the HTML template pieces via Get_Parts(), opens both filehandles, and then
# writes header + rows (via Main_Loop) + footer to the output file.
# Progress and a final row count go to STDOUT.
#
# Takes no arguments; returns nothing. Dies on any open/print/close failure.
sub Main {

  die "I don't see a readable $Data_Input_Filespec file!"
    unless -e $Data_Input_Filespec and -r $Data_Input_Filespec;

  print "Output to $HTML_Output_Filespec\n";
  print "  (I will overwrite the existing one.)\n" if -e $HTML_Output_Filespec;

  # Get the template first, so a broken template never truncates the output.
  ($Header, $Row, $Footer) = Get_Parts();

  open( $IN_FH,  "<", $Data_Input_Filespec )
    or die "Can't read-open $Data_Input_Filespec - $!\n Aborting";
  open( $OUT_FH, ">", $HTML_Output_Filespec )
    or die "Can't write-open $HTML_Output_Filespec - $!\n Aborting";

  # Stamp the page with the name of the generating script, just before <body>.
  $Header =~ s/(<body)/<!-- This file is autogenerated by $0 -->\n$1/i;

  print {$OUT_FH} $Header
    or die "Can't print to $HTML_Output_Filespec - $!\n Aborting";  # should never happen

  Main_Loop();

  print {$OUT_FH} $Footer
    or die "Can't print to $HTML_Output_Filespec - $!\n Aborting";  # should never happen

  close( $OUT_FH )
    or die "Can't close write-channel to $HTML_Output_Filespec - $!\n Aborting"; # should never happen
  close( $IN_FH )
    or die "Can't close read-channel to $Data_Input_Filespec - $!\n Aborting"; # should never happen

  print "Done writing to $HTML_Output_Filespec\n";
  print "Made $Matched_Count rows.\n";
  print "(That feels very low)\n" if $Matched_Count <= 30;

  return;
}

#--------------------------------------------------------------------------

# Main_Loop()
#
# Reads the already-open $IN_FH one line at a time. Every line that contains
# the word COMBINING and parses as "HEX ; NAME" becomes one HTML table row,
# built by template_filled_in() and printed to $OUT_FH. $Matched_Count is
# incremented once per row written.
#
# Lines are read into a lexical variable, so the caller's $_ is left alone.
#
# Takes no arguments; returns nothing. Dies if a row can't be printed.
sub Main_Loop {

 Data_Line:
  while( defined( my $line = readline( $IN_FH ) ) ) {
    next Data_Line unless $line =~ m/\bCOMBINING\b/;
    chomp $line;

    # Now we parse lines like:
    #   0302          ; COMBINING CIRCUMFLEX ACCENT
    #   0303          ; COMBINING TILDE
    #   0304          ; COMBINING MACRON
    # Code points run up to 10FFFF, so accept as many as six hex digits.
    # Anything that doesn't have this shape is skipped.
    my($hex, $name) = $line =~ m/
        ^
          ( [0-9a-fA-F]{1,6} )
          \s+
          ;
          \s*
          ( \S+ .+ )
        $
    /x
      or next Data_Line;

    ++$Matched_Count;

    my $longname = $name;   # the unshortened form, for the hover text

    if( $Shorten_The_Names ) {
      $name =~ s/\b ACCENT    \b/ /gx;
      $name =~ s/\b COMBINING \b/ /gx;
    }

    # Apply the "?" placeholder only after titlecasing: shortening can leave
    # a name that is nothing but spaces, which is still a true value and so
    # would otherwise come out as an empty table cell.
    $name     = titlecase( $name );
    $name     = "?" unless length $name;
    $longname = titlecase( $longname );
    $longname = "?" unless length $longname;

    print {$OUT_FH} template_filled_in( $hex, $name, $longname )
      or die "Can't print to $HTML_Output_Filespec - $!\n Aborting";  # should never happen
  }

  return;
}

#--------------------------------------------------------------------------

# titlecase( $string )
#
# Returns a copy of $string in which each run of non-whitespace characters
# is lowercased and then given an initial capital, with the words rejoined
# by single spaces. Leading, trailing and repeated whitespace is dropped as
# a side effect; a string with no words at all comes back as "".
sub titlecase {
  my($string) = @_;
  return join " ", map { ucfirst( lc( $_ ) ) } ( $string =~ m/(\S+)/g );
}

#--------------------------------------------------------------------------

# template_filled_in( $hex, $name, $longname )
#
# Returns a copy of the row template held in $Row with its placeholders
# replaced: every run of four or more "A"s becomes $hex, every run of four
# or more "N"s becomes $name, and every run of four or more "L"s becomes
# $longname. The replacements are applied in that order, one pass each.
# $Row itself is not modified.
sub template_filled_in {
  my($hex, $name, $longname) = @_;
  my $filled = $Row;

  foreach my $swap (
    [ qr/AAAA+/, $hex      ],
    [ qr/NNNN+/, $name     ],
    [ qr/LLLL+/, $longname ],
  ) {
    my($placeholder, $value) = @$swap;
    $filled =~ s/$placeholder/$value/g;
  }

  return $filled;
}

#--------------------------------------------------------------------------

# Get_Parts()
#
# Reads the whole HTML template from this file's DATA section and cuts it
# into three pieces at its marker comments. The pieces are peeled off from
# the end of the template backwards: first everything after the STARTFOOTER
# marker, then everything after STARTROW, then everything after STARTHEADER.
#
# Returns the list ( $header, $row, $footer ).
# Dies if any marker can't be found, or if anything other than whitespace
# precedes the STARTHEADER marker.
sub Get_Parts {

  my $template = join "", <DATA>;
  my %part_for;

  # Each successful substitution deletes the marker and everything after it,
  # so the order here (last marker first) matters.
  foreach my $marker ( qw( STARTFOOTER STARTROW STARTHEADER ) ) {
    my $finder = regexp_around( $marker );
    $template =~ s/$finder//ismx
      or die "Couldn't find $finder in {{$template}}\nAborting";
    $part_for{ $marker } = $2;   # the text that followed the marker
  }

  die "There's data before 'STARTHEADER'?!\n{{$template}}\nAborting"
    if $template =~ m/\S/;

  return @part_for{ qw( STARTHEADER STARTROW STARTFOOTER ) };
}

# regexp_around( $text )
#
# Returns a compiled pattern that finds the HTML marker comment naming
# $text (for example <!-- STARTROW -->) and everything that follows it:
#   $1 = the marker comment plus any whitespace right after it
#   $2 = all the text from there to the end of the string
# $text is matched literally (it is run through quotemeta) and
# case-insensitively.
sub regexp_around {
  my($text) = @_;
  $text = quotemeta($text);
  # Between the marker name and the closing "-->" allow any amount of
  # padding, so long as it contains no ">" and no letters. Excluding letters
  # stops a longer marker name from matching. The class was previously
  # followed by a stray ">" that made it match exactly one character, so
  # markers with no padding or more than one space were not recognised.
  return qr/( \<!-- \s* $text [^>A-Z]*? --\> \s* )(.+)/ismx;
}

__DATA__
<!-- STARTHEADER -->
<!doctype html><html><head>
<meta charset="utf-8" >
<meta name="Description" content="Unicode composed characters" >
<title>Unicode Composed Characters</title>
<link rel="stylesheet" href="./char_table_style.css">
</head>
<body>

<table class="char_table" width="90%"><tbody>
 <tr>
  <th class="char" width="30%">Char</th>
  <th class="charname" width="50%">Name</th>
 </tr>
 
<!-- STARTROW -->
<tr
 ><td class="char" title="U+AAAA: LLLL">&#xAAAA;</td
 ><td class="charname"   >U+AAAA: NNNN</td
></tr>

<!-- STARTFOOTER -->
</tbody></table>

<p style=" text-align: center; margin: 1em;	 ">(End.)</p>

</body></html>
