Commit a72992fa authored by Jason Rhinelander's avatar Jason Rhinelander
Browse files

Added a nounihan database by default.

The full (unihan-included) database can be loaded by specifying the -H
option.
parent cbf132d8
# Don't commit this; regen.sh will re-download as needed
PropertyValueAliases.txt.gz
ucd.all.flat.xml.gz
ucd.nounihan.flat.xml.gz
......@@ -7,6 +7,12 @@ use XML::LibXML::Reader;
use FindBin;
use IO::Compress::Gzip;
# An optional, sole --nounihan argument selects the Unihan-free input file.
# It is removed from @ARGV here so that the usage check which follows only
# fires on arguments that were not understood.
my $nounihan = 0;
if (scalar(@ARGV) == 1 && $ARGV[0] eq '--nounihan') {
    shift @ARGV;
    $nounihan = 1;
}
die <<USAGE if @ARGV;
Usage: $0 regenerates unicode.data.gz from ucd.all.flat.xml and
......@@ -16,11 +22,15 @@ extract.pl, outputting the results to unicode.data.gz, ready for use by utf8.
This is typically invoked indirectly by running ./regen.sh which downloads the
latest UCD files and invokes this script.
The only supported option is --nounihan, which loads data from
ucd.nounihan.flat.xml instead of ucd.all.flat.xml.
USAGE
my $ucd = "$FindBin::RealBin/ucd.all.flat.xml";
my $ucd_base = "ucd." . ($nounihan ? "nounihan" : "all") . ".flat.xml";
my $ucd = "$FindBin::RealBin/$ucd_base";
$ucd = "$ucd.gz" if not -e $ucd and -e "$ucd.gz";
die "Cannot find ucd.all.flat.xml(.gz) in $FindBin::RealBin\n" if not -e $ucd;
die "Cannot find $ucd_base(.gz) in $FindBin::RealBin\n" if not -e $ucd;
my $unicode = XML::LibXML::Reader->new(
location => $ucd
......
......@@ -4,10 +4,14 @@ set -e
rm -f {ucd.all.flat.xml,PropertyValueAliases.txt,unicode.data}{,.gz}
echo -n "Downloading, extracting, and compressing latest UCD xml data... "
echo -n "Downloading, extracting, and compressing latest UCD xml full data... "
wget -qO- ftp://ftp.unicode.org/Public/UCD/latest/ucdxml/ucd.all.flat.zip | funzip | gzip -9 >ucd.all.flat.xml.gz
echo "done."
echo -n "Downloading, extracting, and compressing latest UCD xml nounihan data... "
wget -qO- ftp://ftp.unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.flat.zip | funzip | gzip -9 >ucd.nounihan.flat.xml.gz
echo "done."
echo -n "Downloading, extracting, and compressing latest PropertyValueAliases.txt... "
wget -qO- ftp://ftp.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt | gzip -9 >PropertyValueAliases.txt.gz
echo "done."
......@@ -16,3 +20,7 @@ echo -n "Regenerating unicode.data.gz... "
./extract.pl | gzip -9 >unicode.data.gz
echo "done."
echo -n "Regenerating unicode-nounihan.data.gz... "
./extract.pl --nounihan | gzip -9 >unicode-nounihan.data.gz
echo "done."
No preview for this file type
......@@ -22,37 +22,7 @@ my $reverse = color 'reverse';
my $underline = color 'underline';
my $reset = color 'reset';
my $unicode_data = "$FindBin::RealBin/unicode.data";
my $unicode_fh;
if (-e $unicode_data) {
open $unicode_fh, '<:utf8', "$FindBin::RealBin/unicode.data"
or die "Cannot locate or access unicode.data\n";
}
elsif (-e "$unicode_data.gz") {
require IO::Uncompress::Gunzip;
$unicode_fh = IO::Uncompress::Gunzip->new("$unicode_data.gz")
or die "Opening $unicode_data.gz failed: $IO::Uncompress::Gunzip::GunzipError\n";
$IO::Uncompress::Gunzip::GunzipError if 0; # Silence "only used once" warning
}
else {
die "Cannot find unicode.data(.gz) in $FindBin::RealBin; perhaps you need to run extract.pl?\n";
}
$unicode_data .= ".gz" if not -e $unicode_data and -e "$unicode_data.gz";
chomp(my $unicode_description = <$unicode_fh>);
# This hash maps fields to positions, the array maps positions to fields
my (%unicode_field, @unicode_field);
{
chomp(my $fields = <$unicode_fh>);
$fields =~ s/^codepoint;//;
my $i = 0;
for (split /;/, $fields) {
$unicode_field[$i] = $_;
$unicode_field{$_} = $i++;
}
}
my ($unicode_fh, $unicode_description, %unicode_field, @unicode_field);
my (@char_cache, $char_cache_done);
sub char_info {
......@@ -225,7 +195,8 @@ my %opts = (format => 'x,c', details => -1);
my $error;
GetOptions(
'help|h|?' => \$opts{help},
'details|d!' => \$opts{details},
'unihan|H!' => \$opts{unihan},
'details|d!' => \$opts{details},
'list|l' => sub { $opts{details} = 0 },
'format|f=s' => \$opts{format},
'decimal|t' => sub { $opts{format} = 'd,c' },
......@@ -253,6 +224,34 @@ if ($opts{search} and $opts{regex}) {
warn "Both --search and --regex provided; --search will be ignored (all arguments treated as regex values)\n";
}
# Open the character database selected by --[no]unihan and read its two header
# lines: a free-form description, then the semicolon-separated field list.
# Populates $unicode_fh, $unicode_description, @unicode_field (position ->
# field name) and %unicode_field (field name -> position).
{
    my $data_name    = "unicode" . ($opts{unihan} ? "" : "-nounihan") . ".data";
    my $unicode_data = "$FindBin::RealBin/$data_name";

    if (-e $unicode_data) {
        # Open the file that was actually selected above; a hard-coded
        # unicode.data here would ignore the unihan/nounihan choice.
        open $unicode_fh, '<:utf8', $unicode_data
            or die "Cannot locate or access $data_name: $!\n";
    }
    elsif (-e "$unicode_data.gz") {
        require IO::Uncompress::Gunzip;
        # NOTE(review): unlike the plain-file branch, no :utf8 layer is applied
        # to this handle -- confirm the readers decode as needed.
        $unicode_fh = IO::Uncompress::Gunzip->new("$unicode_data.gz")
            or die "Opening $unicode_data.gz failed: $IO::Uncompress::Gunzip::GunzipError\n";
        $IO::Uncompress::Gunzip::GunzipError if 0; # Silence "only used once" warning
    }
    else {
        die "Cannot find $data_name(.gz) in $FindBin::RealBin; perhaps you need to run extract.pl?\n";
    }

    # First line: database description. Second line: field names.
    $unicode_description = <$unicode_fh>;
    my $fields = <$unicode_fh>;
    die "$data_name appears to be empty or truncated; perhaps you need to run extract.pl?\n"
        unless defined $unicode_description and defined $fields;
    chomp($unicode_description);
    chomp($fields);

    # The leading codepoint column is implicit and is not given a position.
    $fields =~ s/^codepoint;//;
    my $i = 0;
    for (split /;/, $fields) {
        $unicode_field[$i] = $_;
        $unicode_field{$_} = $i++;
    }
}
if ($opts{search} or $opts{regex}) {
die "Error: " . ($opts{search} ? "--search" : "--regex") . " requires pattern arguments\n"
if not @ARGV;
......@@ -511,6 +510,12 @@ Options:
Recognized options:
-h, -?, --help This help screen.
-H, --[no]unihan If enabled, load the unicode data file that contains
unihan characters; if disabled (the default), unihan
data is not loaded. Unihan data is required to
search for and display CJK Ideograph characters, but
substantially increases the size of the character
database, making the program much slower.
-s, --search Searches for characters with unicode names containing
all of the given strings. Search is case-insensitive.
-r, --regex Like --search, but uses the supplied perl regular
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment