Commit 6cdfc017 authored by Jason Rhinelander's avatar Jason Rhinelander
Browse files

utf8 script overhaul

- No longer depends on Perl's Unicode::UCD for information: instead a
  new script builds a unicode.data file from the UCD XML file.
- Character aliases and abbreviations are now supported for searching
  and are displayed
- Searching now accepts multiple search terms (or regexes), all of which
  must match (any of which if using new --or argument)
- Control character abbreviations now come out of the unicode data
  instead of being hard coded.
- Added scripts to download and extract the data from the latest unicode
  specification.
- Various character attributes are now reported
- The new data file (unicode.data.gz) built by extract.pl is designed to
  work compressed, reducing the storage space required.
- Added the unicode version in which a character was added
- Small cosmetic tweaks
- Rewrote --help output
- Fixed bugs:
  - UTF-16 display was flat out wrong for 4 byte UTF-16 characters
  - --details didn't do anything when searching
  - searching never found control characters
parent 85f2cb0c
# Don't commit this; regen.sh will re-download as needed
PropertyValueAliases.txt.gz
ucd.all.flat.xml.gz
This diff is collapsed.
#!/usr/bin/perl
# Extraction tool: extracts the fields utf8 wants from a full unicode database XML file
use strict;
use warnings;
use XML::LibXML::Reader;
use FindBin;
use IO::Compress::Gzip;
die <<USAGE if @ARGV;
Usage: $0 regenerates unicode.data.gz from ucd.all.flat.xml and
PropertyValueAliases.txt (both optionally gzipped) in the same directory as
extract.pl, outputting the results to unicode.data.gz, ready for use by utf8.
This is typically invoked indirectly by running ./regen.sh which downloads the
latest UCD files and invokes this script.
USAGE
my $ucd = "$FindBin::RealBin/ucd.all.flat.xml";
$ucd = "$ucd.gz" if not -e $ucd and -e "$ucd.gz";
die "Cannot find ucd.all.flat.xml(.gz) in $FindBin::RealBin\n" if not -e $ucd;
my $unicode = XML::LibXML::Reader->new(
location => $ucd
);
if ($unicode->nextElement("description") < 1) {
die "Invalid/unsupported unicode XML data: <description> not found\n";
}
my $unicode_description = $unicode->readInnerXml();
my $unicode_version;
if ($unicode_description =~ /^Unicode (\d+(?:\.\d+)+)$/) {
$unicode_version = $1;
}
else {
die "UCD XML format unexpected: did not find <description>Unicode x.y.z</description> element\n";
}
# Position the reader at the first character element
if ($unicode->nextElement("repertoire") < 1) {
die "Invalid unicode XML data: <repertoire> not found\n";
}
# Read value aliases from the PVA file:
my $pvatxt = "$FindBin::RealBin/PropertyValueAliases.txt";
my $pva;
if (-e $pvatxt) {
open $pva, "<", $pvatxt
or die "Cannot open $pvatxt: $!\n";
}
elsif (-e "$pvatxt.gz") {
require IO::Uncompress::Gunzip;
$pva = IO::Uncompress::Gunzip->new("$pvatxt.gz")
or die "Opening $pvatxt.gz failed: $IO::Uncompress::Gunzip::GunzipError\n";
$IO::Uncompress::Gunzip::GunzipError if 0; # Silence "only used once" warning
}
else {
die "Cannot find PropertyValueAliases.txt(.gz) in $FindBin::RealBin\n";
}
if (<$pva> !~ /# PropertyValueAliases-(\d+(?:\.\d+)+)\.txt/) {
die "Invalid/unexpected PropertyValueAliases file: version not found on first line\n";
}
elsif ($1 ne $unicode_version) {
die "Version mismatch: UCD is from unicode $unicode_version but PropertyValueAliases is from unicode $1\n";
}
my %ucd_aliases;
my $fieldre = qr/\s*([^\s;](?:[^;]*[^\s;])?)\s*/;
while (<$pva>) {
if (/^$fieldre;$fieldre;$fieldre(?:;|$)/) {
$ucd_aliases{$1}{lc $2} = $3 =~ tr/_/ /r
unless lc($2) eq 'n/a';
}
}
# Sanity check:
if (not exists $ucd_aliases{dt}{can} or $ucd_aliases{dt}{can} ne 'Canonical') {
die "PVA error: didn't find dt[can] = Canonical; either the file is invalid, or it is too new (or too old) for this script";
}
# Some extra changes:
delete $ucd_aliases{age};
$ucd_aliases{dt}{none} = '';
$ucd_aliases{dt}{can} = '';
my %fieldmap = (
# 'nice name' => '<char> attribute name'
codepoint => 'cp',
name => 'na',
alt_names => 'N/A', # tab separated
abbreviations => 'N/A', # abbreviations (if any)
version => 'age',
general_category => 'gc',
block => 'blk',
script => 'sc',
decomposition => 'dm',
decomp_type => 'dt',
uppercase => 'uc',
lowercase => 'lc',
properties => 'N/A',
);
# The first 4 of these aren't allowed to change; the order of the rest isn't guaranteed:
my @fields = qw(
codepoint name alt_names abbreviations
version general_category block script decomposition decomp_type
uppercase lowercase properties
);
for (@fields) { die "Internal error: %fieldmap entry for '$_' missing" if not exists $fieldmap{$_}; }
my $output = \*STDOUT;
print $output $unicode_description, "\n";
print $output join(";", @fields), "\n";
$unicode->nextElement("char");
while () {
my %char;
for my $f (@fields) {
my $ucdf = $fieldmap{$f};
my $attrib = $ucdf eq 'N/A' ? '' : $unicode->getAttribute($ucdf);
$char{$f} = (defined $attrib and $attrib ne "#")
? ($ucd_aliases{$ucdf}{lc $attrib} // $attrib)
: "";
}
if (not defined $char{codepoint} or not length $char{codepoint}) {
if (my $fcp = $unicode->getAttribute("first-cp") and
my $lcp = $unicode->getAttribute("last-cp")) {
$char{codepoint} = "$fcp-$lcp";
}
else {
# Otherwise ignore it
}
}
my @props;
push @props, "whitespace" if $unicode->getAttribute("WSpace") eq 'Y';
push @props, "punctuation" if $unicode->getAttribute("Term") eq 'Y';
push @props, "quotation mark" if $unicode->getAttribute("QMark") eq 'Y';
my $num_type = $unicode->getAttribute("nt");
if ($num_type eq 'De') { push @props, "numeric decimal"; }
elsif ($num_type eq 'Di') { push @props, "numeric digit"; }
elsif ($num_type eq 'Nu') { push @props, "numeric"; }
push @props, "mathematical" if $unicode->getAttribute("Math") eq 'Y';
push @props, "ideographic" if $unicode->getAttribute("Ideo") eq 'Y';
$char{properties} = join ',', @props;
if (not $unicode->isEmptyElement()) {
my $node = $unicode->copyCurrentNode(1);
if ($node->hasChildNodes()) {
for my $c ($node->childNodes) {
if ($c->localname and $c->localname eq 'name-alias') {
my ($name, $abbrev);
for my $attr ($c->attributes) {
if ($attr->localname eq 'alias') {
$name = $attr->value;
}
elsif ($attr->localname eq 'type' and $attr->value eq 'abbreviation') {
$abbrev = 1;
}
}
my $k = $abbrev ? "abbreviations" : "alt_names";
$char{$k} .= "\t" if $char{$k};
$char{$k} .= $name;
}
}
}
}
print $output join(";", @char{@fields}), "\n";
}
continue {
last if $unicode->nextSiblingElement("char") < 1;
}
#!/bin/bash
set -e
rm -f {ucd.all.flat.xml,PropertyValueAliases.txt,unicode.data}{,.gz}
echo -n "Downloading, extracting, and compressing latest UCD xml data... "
wget -qO- ftp://ftp.unicode.org/Public/UCD/latest/ucdxml/ucd.all.flat.zip | funzip | gzip -9 >ucd.all.flat.xml.gz
echo "done."
echo -n "Downloading, extracting, and compressing latest PropertyValueAliases.txt... "
wget -qO- ftp://ftp.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt | gzip -9 >PropertyValueAliases.txt.gz
echo "done."
echo -n "Regenerating unicode.data.gz... "
./extract.pl | gzip -9 >unicode.data.gz
echo "done."
File added
......@@ -8,18 +8,185 @@ use Getopt::Long qw(:config gnu_getopt);;
use POSIX qw/ceil/;
use Term::ReadKey qw/GetTerminalSize/;
use Term::ANSIColor;
use Unicode::UCD qw/charinfo/;
use XML::LibXML::Reader;
use Encode;
use FindBin;
open my $unicode_fh, '<', "$FindBin::RealBin/UnicodeData.txt";
use constant {
U_NAME => 0,
U_ALTNAMES => 1,
U_ABBREV => 2,
};
my ($term_cols, $term_lines) = GetTerminalSize();
my $bold = color 'bold';
my $reverse = color 'reverse';
my $underline = color 'underline';
my $reset = color 'reset';
my $unicode_data = "$FindBin::RealBin/unicode.data";
my $unicode_fh;
if (-e $unicode_data) {
open $unicode_fh, '<:utf8', "$FindBin::RealBin/unicode.data"
or die "Cannot locate or access unicode.data\n";
}
elsif (-e "$unicode_data.gz") {
require IO::Uncompress::Gunzip;
$unicode_fh = IO::Uncompress::Gunzip->new("$unicode_data.gz")
or die "Opening $unicode_data.gz failed: $IO::Uncompress::Gunzip::GunzipError\n";
$IO::Uncompress::Gunzip::GunzipError if 0; # Silence "only used once" warning
}
else {
die "Cannot find unicode.data(.gz) in $FindBin::RealBin; perhaps you need to run extract.pl?\n";
}
$unicode_data .= ".gz" if not -e $unicode_data and -e "$unicode_data.gz";
chomp(my $unicode_description = <$unicode_fh>);
# This hash maps fields to positions, the array maps positions to fields
my (%unicode_field, @unicode_field);
{
chomp(my $fields = <$unicode_fh>);
$fields =~ s/^codepoint;//;
my $i = 0;
for (split /;/, $fields) {
$unicode_field[$i] = $_;
$unicode_field{$_} = $i++;
}
}
my (@char_cache, $char_cache_done);
sub char_info {
my $want_cp = shift; # -1 should read everything and return the last one
if ($char_cache_done and ($want_cp > $#char_cache or not $char_cache[$want_cp])) {
return undef;
}
elsif ($want_cp >= 0 and $want_cp <= $#char_cache) {
return $char_cache[$want_cp];
}
while (my $line = <$unicode_fh>) {
my ($cp_start, $cp_end, $rest) = $line =~ /^([[:xdigit:]]{4,})(?:-([[:xdigit:]]{4,}))?;(.*)/;
my $data = [split /;/, $rest];
for my $list_field (U_ALTNAMES, U_ABBREV) {
$data->[$list_field] = length $data->[$list_field] ? [split /\t/, $data->[$list_field]] : undef;
}
my $start = hex($cp_start);
my $end = $cp_end ? hex($cp_end) : $start;
for my $cp ($start .. $end) {
$char_cache[$cp] = $data;
}
last if $want_cp >= 0 and $end >= $want_cp;
}
$char_cache_done = 1 if eof($unicode_fh);
return $char_cache[$want_cp];
}
sub decomp_elaborate {
my @decomp = split ' ', shift;
return join('', map chr hex, @decomp) . " [" . join(', ', map {
my $info = char_info(hex);
my $name = ($info and $info->[U_NAME] or '(unknown)');
"U+\U$_\E ($name)";
} @decomp) . "]";
}
my %format = (
d => "%d",
x => "%x", X => '%X', '0x' => "0x%x", '0X' => "0x%X",
u => "%04X",
o => "%o",
b => "%b", '0b' => '0b%b',
xb => sub { bytes(encode('utf8', chr shift), '', '%02x', '') },
'0xb' => sub { bytes(encode('utf8', chr shift), '0x', '%02x', '') },
Xb => sub { bytes(encode('utf8', chr shift), '', '%02X', '') },
'0Xb' => sub { bytes(encode('utf8', chr shift), '0x', '%02X', '') },
bb => sub { bytes(encode('utf8', chr shift), '', '%08b', '_') },
'0bb' => sub { bytes(encode('utf8', chr shift), '0b', '%08b', '_') },
c => sub { my $c = shift; my $info = char_info($c); $info->[$unicode_field{general_category}] eq 'Control' and $info->[U_ABBREV] and $info->[U_ABBREV][0] or chr($c) },
n => sub { my $c = shift; my $info = char_info($c); $info->[U_NAME] or ($info->[U_ALTNAMES] and $info->[U_ALTNAMES][0]) or "(unknown)" }
);
sub print_details {
my $cp = shift;
my $uinfo = char_info($cp);
if (!$uinfo) {
printf "${bold}U+%04X$reset is not a valid unicode character.\n\n", $cp;
return;
}
my $binary = reverse sprintf "%b", $cp;
$binary =~ s/([01]{8})(?=[01])/$1_/g;
$binary = reverse $binary;
my $uname = $format{n}($cp);
my $uscript = $uinfo->[$unicode_field{script}] || '(unknown)';
my $ublock = $uinfo->[$unicode_field{block}] || '(unknown)';
my $ucat = $uinfo->[$unicode_field{general_category}] || '(unknown)';
my $uversion = $uinfo->[$unicode_field{version}] || '(unknown)';
my $uprops = $uinfo->[$unicode_field{properties}] || '';
$uprops = "\n Extra attributes: " . join ", ", map ucfirst, split ',', $uprops
if $uprops;
my @altnames;
# Slightly tricky: we report the first alias as the character name for nameless characters,
# so don't include the first alias if the character has no name
if ($uinfo->[U_ALTNAMES]) {
if ($uinfo->[U_NAME]) {
push @altnames, @{$uinfo->[U_ALTNAMES]};
}
else {
push @altnames, @{$uinfo->[U_ALTNAMES]}[1 .. $#{$uinfo->[U_ALTNAMES]}];
}
}
my $ualiases = @altnames ? "\n Aliases: " . join "; ", @altnames : "";
my $uabbrev = $uinfo->[U_ABBREV] ? "\n Aliases (abbreviations): " . join "; ", @{$uinfo->[U_ABBREV]} : "";
my $udecomp = '';
if (my $d = $uinfo->[$unicode_field{decomposition}]) {
$udecomp = "\n Decomposition";
my $decomp_type = $uinfo->[$unicode_field{decomp_type}];
if ($decomp_type) {
$udecomp .= " ($decomp_type)";
# Won't have a decomp_type value for a canonical decomposition,
# or for "none" (but that won't have a decomposition value at all)
}
$udecomp .= ": ";
$udecomp .= decomp_elaborate($d);
}
my $upper = $uinfo->[$unicode_field{uppercase}] ? "\n Upper-case: " . decomp_elaborate($uinfo->[$unicode_field{uppercase}]) : '';
my $lower = $uinfo->[$unicode_field{lowercase}] ? "\n Lower-case: " . decomp_elaborate($uinfo->[$unicode_field{lowercase}]) : '';
my $alternatives = ($udecomp or $upper or $lower) ? "\n${underline}Alternatives:$reset$udecomp$upper$lower\n" : "";
my $chr = chr($cp);
my $char_disp = $format{c}($cp);
printf <<DETAILS, $cp;
┌─@{["" x length $char_disp]}─┐
$bold$char_disp$reset
└─@{["" x length $char_disp]}─┘
${bold}U+%1\$04X: $uname$reset
${underline}Unicode properties:$reset
Script: $uscript
Block: $ublock
General category: $ucat
Unicode version: $uversion${
uprops}${
ualiases}${
uabbrev}
$alternatives
${underline}Codepoint:$reset %1\$d (decimal) 0x%1\$x (hex) 0%1\$o (octal) 0b$binary (binary)
${underline}Representations:$reset
UTF-8: @{[chars(encode('utf8', $chr), 1, '0x%02X', ' ')]} (hex) @{[chars(encode('utf8', $chr), 1, '0b%08b', ' ')]} (binary)
UTF-16: @{[chars(encode('utf16-be', $chr), 2, '0x%04X', ' ')]} (hex)
C-style octal escaped UTF-8: @{[chars(encode('utf8', $chr), 1, '\\%03o', '')]}
XML character entity: &#x%1\$x;
DETAILS
}
my ($term_cols, $term_lines) = GetTerminalSize();
binmode STDOUT, ":utf8";
utf8::is_utf8($_) or utf8::decode($_) or die "Invalid input (not UTF-8): $_\n" for @ARGV;
......@@ -39,37 +206,6 @@ my $arg = qr{
_?(\X) # Any old unicode character (or character composition), prefixable with _ (so that _e show U+0065 instead of U+000E)
}six;
my %special;
while (<$unicode_fh>) {
my ($hcp, $name, $special_name) = (split /;/, $_, 12)[0, 1, 10];
my $cp = hex $hcp;
if ($cp >= 0xA0) {
seek $unicode_fh, 0, 0;
last;
}
$special{$cp} = $special_name if $name eq '<control>';
}
my %short_special;
@short_special{0 .. 31, 127 .. 159} = qw{
NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US DEL
PAD HOP BPH NBH IND NEL SSA ESA HTS HTJ VTS PLD PLU RI SS2 SS3 DCS PU1 PU2 STS CCH MW SPA EPA SOS SGCI SCI CSI ST OSC PM APC
};
my %format = (
d => "%d",
x => "%x", X => '%X', '0x' => "0x%x", '0X' => "0x%X",
u => "%04X",
o => "%o",
b => "%b", '0b' => '0b%b',
xb => sub { bytes(encode('utf8', chr shift), '', '%02x', '') },
'0xb' => sub { bytes(encode('utf8', chr shift), '0x', '%02x', '') },
Xb => sub { bytes(encode('utf8', chr shift), '', '%02X', '') },
'0Xb' => sub { bytes(encode('utf8', chr shift), '0x', '%02X', '') },
bb => sub { bytes(encode('utf8', chr shift), '', '%08b', '_') },
'0bb' => sub { bytes(encode('utf8', chr shift), '0b', '%08b', '_') },
c => sub { my $c = shift; $short_special{$c} || chr($c) },
n => sub { my $c = shift; my $info = charinfo($c); $info->{name} || "(unknown)" }
);
......@@ -101,8 +237,9 @@ GetOptions(
'unicode|u' => sub { $opts{format} = 'u,c' },
'bytes' => \$opts{bytes},
'nocolumns|1' => \$opts{nocols},
'search|s=s' => \$opts{search},
'regexp|r=s' => \$opts{regex},
'search|s' => \$opts{search},
'regexp|r' => \$opts{regex},
'or' => \$opts{or},
) or $error++;
my @formats = split /,+/, $opts{format};
......@@ -113,24 +250,66 @@ for (@formats) {
}
}
if ($opts{search} and $opts{regex}) {
warn "Both --search and --regex provided; --search will be ignored (all arguments treated as regex values)\n";
}
if ($opts{search} or $opts{regex}) {
my $regex = $opts{regex} && eval { qr/$opts{regex}/i };
my $search = $opts{search} && lc $opts{search};
if ($@) {
$@ =~ s/at \Q$0\E line \d+\.$//;
die "Invalid regular expression: $@\n";
die "Error: " . ($opts{search} ? "--search" : "--regex") . " requires pattern arguments\n"
if not @ARGV;
my @search;
for (@ARGV) {
if ($opts{regex}) {
my $regex = eval { qr/$_/i };
if ($@) {
$@ =~ s/at \Q$0\E line \d+\.$//;
die "Invalid regular expression: $@\n";
}
push @search, $regex;
}
else {
push @search, qr/\Q$_\E/i;
}
}
while (<$unicode_fh>) {
my ($hcp, $name) = split /;/;
if ($opts{search} ? index(lc($name), $search) >= 0 : $name =~ /$regex/) {
my $uinfo = charinfo(hex $hcp);
print $bold . chr(hex $uinfo->{code}) . qq{: U+$uinfo->{code}$reset "$uinfo->{name}"\n\n};
for (my $i = 0; not $char_cache_done or $i < @char_cache; $i++) {
my $d = char_info($i);
next if not $d;
my @names;
push @names, $d->[U_NAME] if length $d->[U_NAME];
push @names, @{$d->[U_ALTNAMES]} if $d->[U_ALTNAMES];
push @names, @{$d->[U_ABBREV]} if $d->[U_ABBREV];
next unless @names;
my $match;
for my $name (@names) {
$match = 0;
PATTERN: for my $pat (@search) {
if ($name =~ /$pat/) {
$match = 1;
last PATTERN if $opts{or};
}
elsif (not $opts{or}) {
$match = 0;
last PATTERN;
}
}
last if $match;
}
if ($match) {
if ($opts{details} < 1) {
printf "%s%s: U+%04X%s %s\n\n",
$bold, $format{c}($i), $i, $reset, join('/', map qq{"$_"}, @names);
}
else {
# --details mode
print_details($i);
}
}
}
exit;
}
@ARGV = ('00-ff') if not @ARGV;
my @chars;
......@@ -171,56 +350,7 @@ if ($error or $opts{help}) {
if ($opts{details}) {
for (@chars) {
my $uinfo = charinfo($_);
if (!$uinfo) {
printf "${bold}U+%04X$reset is not a valid unicode character.\n\n", $_;
next;
}
my $binary = reverse sprintf "%b", $_;
$binary =~ s/([01]{8})(?=[01])/$1_/g;
$binary = reverse $binary;
my $uname = $uinfo->{name} || '(unknown)';
my $ublock = $uinfo->{block} || '(unknown)';
my $uscript = $uinfo->{script} || '(unknown)';
my $udecomp = "none";
if (my $d = $uinfo->{decomposition}) {
$udecomp = '';
# The decomposition info can start with something like '<compat> ' indicating the
# decomposition type, so copy that out:
$udecomp .= "$1 " if $d =~ s/^(<[^>]*>)\s+//;
# The rest is 4+ digit hex strings:
my @decomp = split ' ', $d;
$udecomp .= join ' ', map {
my $info = charinfo(hex);
my $name = ($info and $info->{name} or '(unknown)');
"U+\U$_\E ($name)";
} @decomp;
$udecomp .= " (" . join('', map chr hex, @decomp) . ")";
}
my $upper = $uinfo->{upper} ? "\n${underline}Upper-case:$reset U+$uinfo->{upper} (" . chr(hex $uinfo->{upper}) . ")\n" : '';
my $lower = $uinfo->{lower} ? "\n${underline}Lower-case:$reset U+$uinfo->{lower} (" . chr(hex $uinfo->{lower}) . ")\n" : '';
my $char_disp = $special{$_} || chr;
printf <<DETAILS, $_;
┌─@{["" x length $char_disp]}─┐
$bold$char_disp$reset
└─@{["" x length $char_disp]}─┘
${bold}U+$uinfo->{code} $uname$reset
${underline}Unicode properties:$reset
Block: $ublock
Script: $uscript
Decomposition: $udecomp
${underline}Codepoint:$reset %1\$d (decimal) 0x%1\$x (hex) 0%1\$o (octal) 0b$binary (binary)
${underline}Representations:$reset
UTF-8: @{[bytes(encode('utf8', chr), '', '%02X', ' ')]} (hex) @{[bytes(encode('utf8', chr), '', '%08b', ' ')]} (binary)
UTF-16: @{[bytes(encode('utf16-be', chr), '0x', '%02X', '')]} (hex) @{[bytes(encode('utf16-be', chr), '0b', '%08b', '_')]} (binary)
C-style octal escaped UTF-8: @{[bytes(encode('utf8', chr), '\\', '%03o', '\\')]}
XML character entity: &#x%1\$x;
$upper$lower
DETAILS
print_details($_);
}
}
else {
......@@ -279,10 +409,25 @@ else {
sub max { $_[0] > $_[1] ? $_[0] : $_[1] }
sub min { $_[0] < $_[1] ? $_[0] : $_[1] }
sub bytes {
my ($bytes, $prefix, $byte_format, $join) = @_;
# utf8::encode(my $copy = $char);
return $prefix . join($join, map sprintf($byte_format, ord), split //, $bytes);
# Decomposes characters into n-byte pieces; the input bytes should be in big-endian order (if $char_size != 1)
sub chars {
my ($be_bytes, $char_size, $char_format, $join) = @_;
die "Invalid chars() call: byte string is not a multiple of char_size"
if length($be_bytes) % $char_size != 0;
my $char_val = 0;
my @chars;
for (my $i = 0; $i < length($be_bytes); $i++) {
my $val = ord(substr($be_bytes, $i, 1));
$char_val <<= 8;
$char_val += $val;
if ($i % $char_size == $char_size-1) {
# At the end of a character
push @chars, $char_val;
$char_val = 0;
}
}
return join $join, map sprintf($char_format, $_), @chars;
}
sub codepoint {
......@@ -307,28 +452,76 @@ sub codepoint {
sub usage {
print <<USAGE;
Usage: $0 [OPTION]... [FROM-TO]... [CHARACTER]...