Commit dd7d9f37 authored by Jason Rhinelander's avatar Jason Rhinelander
Browse files

- Added block/subscript information

- Added decomposition, when it exists, including looking up the decomposed character names, and printing out
	the decomposed version (which should be visually indistinguishable, but can be copied)
- Added unicode version to --help output
- Allowed a single character to be prefixed with _ so that _e can be used to show U+0065 instead of U+000E
- Changed the regexp matching a single character to \X instead of . so that it matches a single grapheme
	cluster instead of just a single codepoint, thus allow decomposed character encodings to work properly
- Print out an error if you try to do x-y where one of x or y is a multi-codepoint composition
	(i.e. a grapheme cluster), since a range makes no sense in that case.
parent 6ccbaca3
......@@ -30,16 +30,16 @@ my $arg = qr{
(
(?:utf)?8\+? [0-9a-fA-F]+ (?:[_ ][0-9a-fA-F]+)* # Hex such as UTF8+E2_80_BD or 8+42, representing a UTF8 representation
)
|
(
[D0]\+ \d+ (?:[_ ]\d+)* # Decimal, such as 0+8253 or D+82_53
)
|
|
(
[D0]\+ \d+ (?:[_ ]\d+)* # Decimal, such as 0+8253 or D+82_53
)
|
(
0?b[01]+(?:[_ ][01]+)* # Binary, such as: 0b100000_00111101, b00100000_00111101, b10000000111101, etc. single _'s allowed.
)
|
(.)
_?(\X) # Any old unicode character (or character composition), prefixable with _ (so that _e show U+0065 instead of U+000E)
}six;
my %special;
......@@ -136,6 +136,27 @@ my @chars;
for (@ARGV) {
if (/^($arg)(?:-($arg))?\z/) {
my ($char, $to) = ($1, $7);
if (defined $to and (
$6 and length($6) > 1
or
$12 and length($12) > 1
)) {
# $6 or $12 is a literal character, but if its length is greater than 1 its a composed character,
# and in such case a range does not make any sense, so error out
my $error = "Invalid range $_ specified: ";
if ($6 and length($6) > 1) {
$error .= "$6";
if ($12 and length($12) > 1) {
$error .= " and $12 are both multiple codepoint composed character sequences!";
}
else {
$error .= " is a composed character sequence!";
}
}
else {
$error .= "$12 is a composed character sequence!";
}
}
push @chars, defined $to
? (codepoint($char) .. codepoint($to))
: codepoint($char);
......@@ -166,6 +187,22 @@ if ($opts{details}) {
$binary = reverse $binary;
my $uname = $uinfo->{name} || '(unknown)';
my $ublock = $uinfo->{block} || '(unknown)';
my $uscript = $uinfo->{script} || '(unknown)';
my $udecomp = "none";
if (my $d = $uinfo->{decomposition}) {
$udecomp = '';
# The decomposition info can start with something like '<compat> ' indicating the
# decomposition type, so copy that out:
$udecomp .= "$1 " if $d =~ s/^(<[^>]*>)\s+//;
# The rest is 4+ digit hex strings:
my @decomp = split ' ', $d;
$udecomp .= join ' ', map {
my $info = charinfo(hex);
my $name = ($info and $info->{name} or '(unknown)');
"U+\U$_\E ($name)";
} @decomp;
$udecomp .= " (" . join('', map chr hex, @decomp) . ")";
}
my $upper = $uinfo->{upper} ? "\n${underline}Upper-case:$reset U+$uinfo->{upper} (" . chr(hex $uinfo->{upper}) . ")\n" : '';
my $lower = $uinfo->{lower} ? "\n${underline}Lower-case:$reset U+$uinfo->{lower} (" . chr(hex $uinfo->{lower}) . ")\n" : '';
my $char_disp = $special{$_} || chr;
......@@ -175,7 +212,10 @@ if ($opts{details}) {
└─@{["" x length $char_disp]}─┘
${bold}U+$uinfo->{code} $uname$reset
${underline}Unicode category:$reset $ublock
${underline}Unicode properties:$reset
Block: $ublock
Script: $uscript
Decomposition: $udecomp
${underline}Codepoint:$reset %1\$d (decimal) 0x%1\$x (hex) 0%1\$o (octal) 0b$binary (binary)
......@@ -203,9 +243,7 @@ else {
my $i = -1;
for (@chars) {
$max[++$col] = [map length $header{$_}, @formats] if ++$i % $rows == 0;
#warn "formats: " . @formats . " vs. chars: " . @$_;
for my $i (0 .. $#$_) {
#warn "max[$col]->[$i] = $max[$col]->[$i]";
$max[$col]->[$i] = length($_->[$i]) if length($_->[$i]) > $max[$col]->[$i];
}
}
......@@ -225,11 +263,11 @@ else {
for my $col (0 .. $#max) {
push @tpl, join(' ', map { '%' . ($formats[$_] eq 'c' ? '-' : '') . "$max[$col][$_]s" } 0 .. $#formats);
}
unless (@formats == 1 and length($header{$formats[0]}) == 1) {
# A single-letter header just looks wrong when there is only
# one column, so just suppress it.
print join(' ', map sprintf($_, map $header{$_}, @formats), @tpl), "\n";
}
unless (@formats == 1 and length($header{$formats[0]}) == 1) {
# A single-letter header just looks wrong when there is only
# one column, so just suppress it.
print join(' ', map sprintf($_, map $header{$_}, @formats), @tpl), "\n";
}
for my $row (0 .. $rows-1) {
my @cells;
......@@ -258,7 +296,7 @@ sub codepoint {
$value =~ /^$arg$/ or die "Cannot compute codepoint of `$value'";
my ($hex, $utf8_code, $decimal, $bin, $chr) = ($1, $2, $3, $4, $5);
$hex =~ s/^(?:U\+?|0?x)//i if defined $hex;
$decimal =~ s/^[0D]\+//i if defined $decimal;
$decimal =~ s/^[0D]\+//i if defined $decimal;
if (defined $utf8_code) {
$utf8_code =~ s/.*\+//;
(my $c = $utf8_code) =~ y/ _//d;
......@@ -267,6 +305,9 @@ sub codepoint {
length($bytes) == 1 or die "Invalid input: 'UTF8+$utf8_code' decodes to multiple characters\n";
return ord $bytes;
}
elsif (defined $chr and length($chr) > 1) {
return map ord, split '', $chr;
}
return defined $hex ? hex($hex) : defined $decimal ? 0+$decimal : defined $bin ? oct($bin) : ord $chr
}
......@@ -278,7 +319,9 @@ Displays UTF-8 characters from FROM to TO, or from each character in the list
provided. FROM, TO and CHARACTER can be hexadecimal numbers (optionally
beginning with 0x or the unicode-style U+ prefix), binary numbers (beginning
with 0b), decimal numbers prefixed with "0+" or "D+", or single UTF-8
characters. If no FROM-TO pairs nor CHARACTERs are specified, characters from
characters, optionally prefixed with an underscore (required for literal
characters 0-9, a-f, and A-F, to avoid being interpreted as a hexademical
character). If no FROM-TO pairs nor CHARACTERs are specified, characters from
0 to 255 are displayed.
Recognized options:
......@@ -310,6 +353,9 @@ Under --nodetails mode:
-b Alias for --format=b,c
-1 Don't put results in columns
Current unicode database version: @{[Unicode::UCD::UnicodeVersion]}
USAGE
}
#vim:et
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment