Commit 4d2950fb authored by Jason Rhinelander
Browse files

Added support for 'UTF8+XXXX', where 'XXXX' is the UTF8 sequence in hex;

Added support for spaces (which must be shell-escaped) in addition to underscores as separators between digit groups
parent a45239f5
#!/usr/bin/perl
use utf8;
use bytes();
use strict;
use warnings;
use Getopt::Long qw(:config gnu_getopt);;
......@@ -23,11 +24,15 @@ utf8::is_utf8($_) or utf8::decode($_) or die "Invalid input (not UTF-8): $_\n" f
my $arg = qr{
(
(?:0?x | [Uu]\+?)? [[:xdigit:]]+ (?:_[[:xdigit:]]+)* # Hex, such as: 0x203d, x203d, 0x20_3d, etc. single _'s allowed. Also allows the U+0123 system.
(?:0?x | [Uu]\+?)? [[:xdigit:]]+ (?:[_ ][[:xdigit:]]+)* # Hex, such as: 0x203d, x203d, 0x20_3d, etc. single _'s or spaces [which need to be shell escaped] allowed. Also allows the U+0123 system.
)
|
(
0?b[01]+(?:_[01]+)* # Binary, such as: 0b100000_00111101, b00100000_00111101, b10000000111101, etc. single _'s allowed.
(?:utf)?8\+? [[:xdigit:]]+ (?:[_ ][[:xdigit:]]+)* # Hex such as UTF8+E2_80_BD or 8+42, representing a UTF8 representation
)
|
(
0?b[01]+(?:[_ ][01]+)* # Binary, such as: 0b100000_00111101, b00100000_00111101, b10000000111101, etc. single _'s allowed.
)
|
(.)
......@@ -126,7 +131,7 @@ if ($opts{search} or $opts{regex}) {
my @chars;
for (@ARGV) {
if (/^($arg)(?:-($arg))?\z/) {
my ($char, $to) = ($1, $5);
my ($char, $to) = ($1, $6);
push @chars, defined $to
? (codepoint($char) .. codepoint($to))
: codepoint($char);
......@@ -243,8 +248,16 @@ sub bytes {
# Convert a single textual character specification into its codepoint.
#
# The argument must match the file-level $arg pattern, which has one capture
# group per accepted notation, in this order:
#   $1  hex        e.g. 0x203d, x203d, U+203D, 20_3d, "20 3d"
#   $2  UTF-8 hex  e.g. UTF8+E2_80_BD, 8+42 (the encoded bytes, in hex)
#   $3  binary     e.g. 0b100000_00111101, b10000000111101
#   $4  literal    any single character
#
# Returns the codepoint as an integer.
# Dies if the value does not match $arg, if a UTF-8 byte sequence is not
# valid UTF-8, or if it decodes to more than one character.
sub codepoint {
    my $value = shift;
    $value =~ /^$arg$/ or die "Cannot compute codepoint of `$value'";

    # Four alternatives in $arg means four captures; exactly one is defined.
    my ($hex, $utf8_code, $bin, $chr) = ($1, $2, $3, $4);

    if (defined $utf8_code) {
        # The '+' after the prefix is optional in $arg, so strip the prefix
        # itself rather than "everything up to the plus sign".
        $utf8_code =~ s/^(?:utf)?8\+?//i;

        # Drop the digit-group separators before pairing the digits up.
        (my $c = $utf8_code) =~ y/ _//d;

        # Split into two-digit groups counted from the right, so an odd
        # number of digits leaves the lone digit as the first byte.
        my $bytes = join '', map bytes::chr(hex $_), split /(?=(?:[[:xdigit:]]{2})+$)/, $c;

        utf8::decode($bytes) or die "Invalid input: 'UTF8+$utf8_code' does not appear to be a valid UTF-8 sequence\n";
        length($bytes) == 1 or die "Invalid input: 'UTF8+$utf8_code' decodes to multiple characters\n";
        return ord $bytes;
    }

    if (defined $hex) {
        $hex =~ s/^(?:U\+?|0?x)//i;
        # hex() tolerates underscores but stops at a space, so remove both.
        $hex =~ y/ _//d;
        return hex $hex;
    }

    if (defined $bin) {
        # oct() understands the "0b"/"b" prefix but stops at a space.
        $bin =~ y/ _//d;
        return oct $bin;
    }

    return ord $chr;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment