Commit da9b0142 authored by Jason Rhinelander's avatar Jason Rhinelander
Browse files

Added utf8 script

parent 9650fbc3
#!/usr/bin/perl
use utf8;
use strict;
use warnings;
use Getopt::Long qw(:config gnu_getopt);;
use POSIX qw/ceil/;
use Term::ReadKey qw/GetTerminalSize/;
use Term::ANSIColor;
use Unicode::UCD qw/charinfo/;
use Encode;
Unicode::UCD::openunicode(\my $unicode_fh, 'UnicodeData.txt');
my ($term_cols, $term_lines) = GetTerminalSize();
my $bold = color 'bold';
my $reverse = color 'reverse';
my $underline = color 'underline';
my $reset = color 'reset';
binmode STDOUT, ":utf8";
utf8::is_utf8($_) or utf8::decode($_) or die "Invalid input (not UTF-8): $_\n" for @ARGV;
my $arg = qr{
(
(?:0?x | [Uu]\+?)? [[:xdigit:]]+ (?:_[[:xdigit:]]+)* # Hex, such as: 0x203d, x203d, 0x20_3d, etc. single _'s allowed. Also allows the U+0123 system.
)
|
(
0?b[01]+(?:_[01]+)* # Binary, such as: 0b100000_00111101, b00100000_00111101, b10000000111101, etc. single _'s allowed.
)
|
(.)
}six;
my %special;
while (<$unicode_fh>) {
my ($hcp, $name, $special_name) = (split /;/, $_, 12)[0, 1, 10];
my $cp = hex $hcp;
if ($cp >= 0xA0) {
seek $unicode_fh, 0, 0;
last;
}
$special{$cp} = $special_name if $name eq '<control>';
}
my %format = (
d => "%d",
x => "%x", X => '%X', '0x' => "0x%x", '0X' => "0x%X",
u => "%04X",
o => "%o",
b => "%b", '0b' => '0b%b',
xb => sub { bytes(encode('utf8', chr shift), '', '%02x', '') },
'0xb' => sub { bytes(encode('utf8', chr shift), '0x', '%02x', '') },
Xb => sub { bytes(encode('utf8', chr shift), '', '%02X', '') },
'0Xb' => sub { bytes(encode('utf8', chr shift), '0x', '%02X', '') },
bb => sub { bytes(encode('utf8', chr shift), '', '%08b', '_') },
'0bb' => sub { bytes(encode('utf8', chr shift), '0b', '%08b', '_') },
c => sub { my $c = shift; $special{$c} || chr($c) }
);
my %header = (
d => "Dec",
x => "Hex", X => "Hex", '0x' => "Hex", '0X' => "Hex",
u => 'U+',
o => 'Oct',
b => 'Binary', '0b' => 'Binary',
xb => 'HexB', '0xb' => 'HexB', Xb => 'HexB', '0xb' => 'HexB',
bb => 'BinaryB', '0bb' => 'BinaryB',
c => 'C'
);
my %opts = (format => 'x,c', details => -1);
my $error;
GetOptions(
'help|h|?' => \$opts{help},
'details!' => \$opts{details},
'format|f=s' => \$opts{format},
'decimal|d' => sub { $opts{format} = 'd,c' },
'hex|x' => sub { $opts{format} = 'x,c' },
'octal|o' => sub { $opts{format} = 'o,c' },
'binary|b' => sub { $opts{format} = 'b,c' },
'unicode|u' => sub { $opts{format} = 'u,c' },
'bytes' => \$opts{bytes},
'nocolumns|1' => \$opts{nocols},
'search|s=s' => \$opts{search},
'regexp|r=s' => \$opts{regex},
) or $error++;
my @formats = split /,+/, $opts{format};
for (@formats) {
unless (exists $format{$_}) {
warn "Invalid format: `$_'\n";
$error++;
}
}
if ($opts{search} or $opts{regex}) {
my $regex = $opts{regex} && eval { qr/$opts{regex}/i };
my $search = $opts{search} && lc $opts{search};
if ($@) {
$@ =~ s/at \Q$0\E line \d+\.$//;
die "Invalid regular expression: $@\n";
}
while (<$unicode_fh>) {
my ($hcp, $name) = split /;/;
if ($opts{search} ? index(lc($name), $search) >= 0 : $name =~ /$regex/) {
my $uinfo = charinfo(hex $hcp);
print $bold . chr(hex $uinfo->{code}) . qq{: U+$uinfo->{code}$reset "$uinfo->{name}"\n\n};
}
}
exit;
}
@ARGV = ('0-255') if not @ARGV;
my @chars;
for (@ARGV) {
if (/^($arg)(?:-($arg))?\z/) {
my ($char, $to) = ($1, $5);
push @chars, defined $to
? (codepoint($char) .. codepoint($to))
: codepoint($char);
$opts{details} = 0 if defined $to and $opts{details} and $opts{details} == -1;
}
else {
warn "Invalid input: `$_'\n";
$error++;
}
}
if ($error or $opts{help}) {
print "\n";
usage();
exit $error ? 1 : 0;
}
if ($opts{details}) {
for (@chars) {
my $uinfo = charinfo($_);
if (!$uinfo) {
printf "${bold}U+%04X$reset is not a valid unicode character.\n\n", $_;
next;
}
my $binary = reverse sprintf "%b", $_;
$binary =~ s/([01]{8})(?=[01])/$1_/g;
$binary = reverse $binary;
my $uname = $uinfo->{name} || '(unknown)';
my $ublock = $uinfo->{block} || '(unknown)';
my $upper = $uinfo->{upper} ? "\n${underline}Upper-case:$reset U+$uinfo->{upper} (" . chr(hex $uinfo->{upper}) . ")\n" : '';
my $lower = $uinfo->{lower} ? "\n${underline}Lower-case:$reset U+$uinfo->{lower} (" . chr(hex $uinfo->{lower}) . ")\n" : '';
my $char_disp = $special{$_} || chr;
printf <<DETAILS, $_;
┌─@{["" x length $char_disp]}─┐
$bold$char_disp$reset
└─@{["" x length $char_disp]}─┘
${bold}U+$uinfo->{code} $uname$reset
${underline}Unicode category:$reset $ublock
${underline}Codepoint:$reset %1\$d (decimal) 0x%1\$x (hex) 0%1\$o (octal) 0b$binary (binary)
${underline}Representations:$reset
UTF-8: @{[bytes(encode('utf8', chr), '', '%02X', ' ')]} (hex) @{[bytes(encode('utf8', chr), '', '%08b', ' ')]} (binary)
UTF-16: @{[bytes(encode('utf16-be', chr), '0x', '%02X', '')]} (hex) @{[bytes(encode('utf16-be', chr), '0b', '%08b', '_')]} (binary)
C-style octal escaped UTF-8: @{[bytes(encode('utf8', chr), '\\', '%03o', '\\')]}
XML decimal entity: &#x%1\$x;
$upper$lower
DETAILS
}
}
else {
for my $char (@chars) {
$char = [map { ref $format{$_} eq 'CODE' ? $format{$_}->($char) : sprintf($format{$_}, $char) } @formats];
}
my $rows;
my @max;
for (my $cols = int max(1, min(($term_cols || 80) / (@formats * 2 - 1), @chars / 16)); $cols > 0; $cols--) {
@max = ();
$rows = ceil(@chars / $cols);
my $col = -1;
my $i = -1;
for (@chars) {
$max[++$col] = [map length $header{$_}, @formats] if ++$i % $rows == 0;
#warn "formats: " . @formats . " vs. chars: " . @$_;
for my $i (0 .. $#$_) {
#warn "max[$col]->[$i] = $max[$col]->[$i]";
$max[$col]->[$i] = length($_->[$i]) if length($_->[$i]) > $max[$col]->[$i];
}
}
my $width = 0;
for (0 .. $#max) {
$width += 2 if $_; # Account for 2 spaces between columns
$width += @formats-1; # Account for the spaces between sub-elements
$width += $_ for @{$max[$_]};
}
last if $width <= ($term_cols || 80);
}
# formats: d x c
# max: [4 5 6] [4 5 4] [6 7 8] ...
my @tpl;
for my $col (0 .. $#max) {
push @tpl, join(' ', map { '%' . ($formats[$_] eq 'c' ? '-' : '') . "$max[$col][$_]s" } 0 .. $#formats);
}
print join(' ', map sprintf($_, map $header{$_}, @formats), @tpl), "\n";
for my $row (0 .. $rows-1) {
my @cells;
for my $col (0 .. $#max) {
my $i = $row + $col * $rows;
push @cells, sprintf $tpl[$col], @{$chars[$i]} if $chars[$i];
}
print join(' ', @cells), "\n";
}
}
sub max { $_[0] > $_[1] ? $_[0] : $_[1] }
sub min { $_[0] < $_[1] ? $_[0] : $_[1] }
sub bytes {
my ($bytes, $prefix, $byte_format, $join) = @_;
# utf8::encode(my $copy = $char);
return $prefix . join($join, map sprintf($byte_format, ord), split //, $bytes);
}
sub codepoint {
my $value = shift;
$value =~ /^$arg$/ or die "Cannot compute codepoint of `$value'";
my ($hex, $bin, $chr) = ($1, $2, $3);
$hex =~ s/^(?:U\+?|0?x)//i if defined $hex;
return defined $hex ? hex($hex) : defined $bin ? oct($bin) : ord $chr
}
sub usage {
print <<USAGE;
Usage: $0 [OPTION]... [FROM-TO]... [CHARACTER]...
Displays UTF-8 characters from FROM to TO, or from each character in the list
provided. FROM, TO and CHARACTER can be hexadecimal numbers (optionally
beginning with 0x or the unicode-style U+ prefix), binary numbers (beginning
with 0b), or single UTF-8 characters. If no FROM-TO pairs nor CHARACTERs are
specified, characters from 0 to 255 are displayed.
Recognized options:
-h, -?, --help This help screen.
--[no]details Display [or not] character details [Default: display
details if no FROM-TO sequences specified]
-s, --search="name" Searches for characters with unicode names containing
"name". Search is case-insensitive.
-r, --regex="regex" Like --search, but uses the supplied perl regular
expression.
Under --nodetails mode:
-f, --format=FORMAT Display character list in the specified FORMATs.
Formats are comma-separated; supported formats:
d = decimal, x = hexadecimal, o = octal, b = binary
xb, bb = hexadecimal/binary UTF-8 byte representation
0x, 0b, 0o, 0xb, 0bb = prepends 0x, 0b, or 0.
X, Xb, 0X, 0Xb - use upper-case A..F
u - use U+12AB style hex formatting
c - the character (in UTF-8)
[Default: d,x,c]
-d Alias for --format=d,c
-x Alias for --format=x,c
-X Alias for --format=X,c
-u Alias for --format=u,c
-o Alias for --format=o,c
-b Alias for --format=b,c
-1 Don't put results in columns
USAGE
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment