# Makes an HTML table of all the combining characters in current Unicode table "DerivedName.txt"
# sburke@pobox.com 2024-03-13
#
# NOTE(review): this file was recovered from a copy in which all line breaks
# had been collapsed and every "<...>" span had been stripped out.  Three
# pieces could not be recovered verbatim and are marked NOTE(review) below:
#   1. a substitution applied to $Header in Main(),
#   2. the pattern returned by regexp_around(),
#   3. the HTML template in the DATA section at the end of the file.
# Compare against the upstream original before relying on the exact output.

use strict;
use warnings;

# Input: the Unicode Character Database file listing code points and names.
our $Data_Input_Filespec  = "DerivedName.txt";
# Output: the HTML page this program (over)writes.
our $HTML_Output_Filespec = "combining_characters_table.html";

our $Shorten_The_Names = 1;  # 0 to not shorten them at all
  # Note: Full Names are on the hovers

#--------------------------------------------------------------------------

our( $Header, $Row, $Footer );  # the three template parts, set by Get_Parts()
our( $IN_FH, $OUT_FH );         # filehandles
our $Matched_Count = 0;         # number of table rows written so far

Main();
exit;

#--------------------------------------------------------------------------

# Main()
# Checks that the input file is readable, loads the template parts, opens
# both files, writes header + one row per matching input line + footer, and
# prints a short report to STDOUT.  Dies on any I/O failure.
sub Main {

  unless( -e $Data_Input_Filespec and -r $Data_Input_Filespec ) {
    die "I don't see a readable $Data_Input_Filespec file!";
  }

  print "Output to $HTML_Output_Filespec\n";
  if( -e $HTML_Output_Filespec ) {
    print " (I will overwrite the existing one.)\n";
  }

  ($Header, $Row, $Footer) = Get_Parts();

  open $IN_FH, "<", $Data_Input_Filespec
    or die "Can't read-open $Data_Input_Filespec - $!\n Aborting";
  open $OUT_FH, ">", $HTML_Output_Filespec
    or die "Can't write-open $HTML_Output_Filespec - $!\n Aborting";

  # NOTE(review): the original had one substitution on $Header right here.
  # All that survived tag-stripping is the fragment
  #     $Header =~ s/(\n$1/i;
  # i.e. it captured the start of some tag and re-emitted it as "\n$1" after
  # some inserted markup.  Neither the tag nor the inserted markup is
  # recoverable, so nothing is inserted; restore it from the upstream copy.

  print $OUT_FH $Header
    or die "Can't print to $HTML_Output_Filespec - $!\n Aborting";  # should never happen

  Main_Loop();

  print $OUT_FH $Footer
    or die "Can't print to $HTML_Output_Filespec - $!\n Aborting";  # should never happen

  # Buffered write errors only surface at close, so this one matters.
  close $OUT_FH
    or die "Can't close write-channel to $HTML_Output_Filespec - $!\n Aborting";  # should never happen
  close $IN_FH
    or die "Can't close read-channel to $Data_Input_Filespec - $!\n Aborting";  # should never happen

  print "Done writing to $HTML_Output_Filespec\n";
  print "Made $Matched_Count rows.\n";
  print "(That feels very low)\n" unless $Matched_Count > 30;

  return;
}

#--------------------------------------------------------------------------

# Main_Loop()
# Reads $IN_FH line by line and, for every line whose text contains the word
# COMBINING and parses as "HEX ; NAME", writes one filled-in row to $OUT_FH
# and increments $Matched_Count.
sub Main_Loop {

 Data_Line:
  while( defined( my $line = readline( $IN_FH ) ) ) {
    next Data_Line unless $line =~ m/\bCOMBINING\b/;
    chomp $line;

    #Now we parse lines like:
    # 0302          ; COMBINING CIRCUMFLEX ACCENT
    # 0303          ; COMBINING TILDE
    # 0304          ; COMBINING MACRON

    my($hex, $name);
    # Up to six hex digits, since code points run to 10FFFF.  Range lines
    # ("XXXX..YYYY ; ...") fail this pattern and are skipped.
    if( $line =~ m/ ^ ( [0-9a-fA-F]{1,6} ) \s+ ; \s* (\S+.+) $ /x ) {
      ($hex, $name) = ($1, $2);
    } else {
      next Data_Line;
    }

    ++$Matched_Count;

    my $longname = $name;
    if( $Shorten_The_Names ) {
      $name =~ s/\b ACCENT    \b/ /gx;
      $name =~ s/\b COMBINING \b/ /gx;
    }

    # titlecase() also squeezes whitespace (including a stray CR from CRLF
    # input).  Apply the "?" fallback afterwards so that a name reduced to
    # nothing by the shortening still shows a placeholder.
    $name     = titlecase( $name );
    $longname = titlecase( $longname );
    $name     = "?" unless length $name;
    $longname = "?" unless length $longname;

    print $OUT_FH template_filled_in( $hex, $name, $longname )
      or die "Can't print to $HTML_Output_Filespec - $!\n Aborting";  # should never happen
  }

  return;
}

#--------------------------------------------------------------------------

# titlecase( $string )
# Returns $string with each whitespace-separated word lowercased and then
# first-letter-capitalized, joined by single spaces.  Leading, trailing and
# repeated whitespace is dropped.
sub titlecase {
  my($string) = @_;
  return join " ", map { ucfirst( lc( $_ ) ) } ( $string =~ m/(\S+)/g );
}

#--------------------------------------------------------------------------

# template_filled_in( $hex, $name, $longname )
# Returns a copy of the $Row template with every run of four or more "A"
# replaced by $hex, of "N" by $name, and of "L" by $longname.  It is done
# in a single pass so an inserted value is never re-scanned for placeholders.
sub template_filled_in {
  my($hex, $name, $longname) = @_;
  my %value_for = ( A => $hex, N => $name, L => $longname );

  my $t = $Row;
  $t =~ s/(A{4,}|N{4,}|L{4,})/$value_for{ substr( $1, 0, 1 ) }/g;
  return $t;
}

#--------------------------------------------------------------------------

# Get_Parts()
# Slurps the DATA section and splits it into ($header, $row, $footer).
# Parts are peeled off from the end (footer, then row, then header), because
# each pattern from regexp_around() removes its marker and everything after
# it.  Dies if a marker is missing or if non-whitespace precedes the header
# marker.
sub Get_Parts {
  #print "Getting parts...\n";
  my $page = join( "", readline( *DATA ) );

  my %part_for;
  foreach my $marker (qw( STARTFOOTER STARTROW STARTHEADER )) {
    my $re = regexp_around( $marker );
    if( $page =~ s/$re//ismx ) {
      $part_for{$marker} = $2;
    } else {
      die "Couldn't find $re in {{$page}}\nAborting";
    }
    #print "$marker={{$part_for{$marker}}}\n";
  }

  if( $page =~ m/\S/ ) {
    die "There's data before 'STARTHEADER'?!\n{{$page}}\nAborting";
  }

  return( @part_for{qw( STARTHEADER STARTROW STARTFOOTER )} );
}

# regexp_around( $text )
# Returns a compiled pattern matching the template marker named $text and
# everything after it: $1 is the marker (with trailing whitespace), $2 is
# the content that follows, up to the end of the string.
#
# NOTE(review): the original pattern was lost; only "qr/( \" survived.
# This version is rebuilt from how Get_Parts() uses it and assumes markers
# are HTML comments of the form "<!-- NAME -->", as written in the DATA
# section below.  Confirm against the upstream copy.
sub regexp_around {
  my($text) = @_;
  $text = quotemeta($text);
  return qr/( <!-- \s* $text \s* --> \s* ) ( .* )/isx;
}

#--------------------------------------------------------------------------
# NOTE(review): the template below is rebuilt from the text that survived
# tag-stripping (page title, "Char" / "Name" column heads, "U+AAAA: NNNN",
# "(End.)").  All markup, the dotted-circle base character and the use of
# LLLL in a title attribute are assumptions; replace with the upstream copy.

__DATA__
<!-- STARTHEADER -->
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Unicode Composed Characters</title>
</head>
<body>
<table>
<tr><th>Char</th><th>Name</th></tr>
<!-- STARTROW -->
<tr><td>&#x25CC;&#xAAAA;</td><td title="LLLL">U+AAAA: NNNN</td></tr>
<!-- STARTFOOTER -->
</table>
<p>(End.)</p>
</body>
</html>