#!/usr/bin/perl -w
use strict;
use Encode;

############################################################################
# code page data builder, by magnum / JimF.   v1.01
# coded July 27-29, 2011, as a tool to build codepage encoding data needed
# for John the Ripper code page conversions.  The data output from this file 
# is made to be directly placed into the ./src/encoding_data.h file in john's
# source tree.
# USAGE:  cmpt_cp.pl CODEPAGE
# cmpt_cp.pl run without any arguments will show a list of possible code pages.
############################################################################

my $enc;
if (@ARGV==1) {$enc=$ARGV[0];}
else {
	print "Supported encodings:\n", join(", ", Encode->encodings(":all")), "\n\n";
	exit(0);
}

my $to_unicode_high128=""; my $lower=""; my $upper=""; my $lowonly=""; my $uponly="";
my $encu = uc($enc);my $hs = "";
$encu =~ s/-/_/g;
################################# 
# first step, compute the unicode array
################################# 
foreach my $i (0x80..0xFF) {
	my $u = chr($i);
	Encode::from_to($u, $enc, "utf8");
	$hs .= $u;
	$to_unicode_high128 .= "0x" . sprintf "%04X", ord Encode::decode("UTF-8", $u);
	if ($i % 16 == 15 && $i != 255) { $to_unicode_high128 .= ",\n"; }
	elsif ($i != 255) { $to_unicode_high128 .= ","; }
}
print "\n// ", $hs, "\n";
print "\n// here is the $encu to Unicode conversion for $encu characters from 0x80 to 0xFF\n";
print "static UTF16 ".$encu."_to_unicode_high128[] = {\n";
print $to_unicode_high128 . " };\n";

################################# 
# Now build upcase/downcase data.
################################# 
foreach my $i (0x80..0xFF) {
	my $c = chr($i);
	# converts $c into utf8, from $enc code page, and 'sets' the 'flag' in perl that $c IS a utf8 char.
	$c = Encode::decode($enc, $c);
	# upcase and low case the utf8 chars
	my $clc = lc $c; my $ulc = uc $c;
	# reconvert the utf8 char's back into $enc code page.
	$clc = Encode::encode($enc, $clc); $ulc = Encode::encode($enc, $ulc);
	if (chr($i) eq $clc && $clc ne $ulc) { 
		if (chr($i) ne $clc && chr($i) ne $ulc) {
			printf("// *** WARNING, char at ord(0x%X) U+%04X needs to be looked into. Neither conversion gets back to original value!\n",$i,ord($c));
		} elsif ( ord($ulc) == 0x3F) { # 0x3F is ?  In perl, if there IS a unicode conversion for a case, but there is no 'reverse' case char in the charset, 
			$lowonly .= sprintf("\\x%02X", ord($clc));
		} elsif ( ord($clc) == 0x3F) { # then the final encode gives us a '?'. Thus, we know this is a character, and know it's case, but we can not 'convert' it to the other case.
			$uponly .= sprintf("\\x%02X", ord($ulc));
		} elsif ( ord($clc) < 0x80 || ord($ulc) < 0x80) {
			printf("// *** WARNING, char at ord(0x%X) U+%04X needs to be looked into.  Likely multi-byte conversion\n",$i,ord($c));
		} else {
			$lower .= sprintf("\\x%02X", ord($clc));
			$upper .= sprintf("\\x%02X", ord($ulc));
		}
	}
}
print "#define CHARS_LOWER_".$encu." "; if (length($lower)>80) {print"\\\n\t";} print "\"".$lower."\"\n"; 
print "#define CHARS_UPPER_".$encu." "; if (length($upper)>80) {print"\\\n\t";} print "\"".$upper."\"\n";
if (length($lowonly)>0) {
	print "#define CHARS_LOW_ONLY_".$encu." "; if (length($lowonly)>80) {print"\\\n\t";} print "\"".$lowonly."\"\n";
}
if (length($uponly)>0) {
	print "#define CHARS_UP_ONLY_".$encu." "; if (length($uponly)>80) {print"\\\n\t";} print "\"".$uponly."\"\n";
}

################################# 
# Ok, provide a check to see if any of the characters UNDER 0x80
# are non-standard.  At this time, there is no plan on HOW to handle
# this within john.  The information is simply listed at this time.
################################# 
foreach my $i (0x00..0x7F) {
	my $u = chr($i);
	Encode::from_to($u, $enc, "utf8");
	my $str = sprintf "%04X", ord Encode::decode("UTF-8", $u);
	if ( hex($str) != $i) { printf("WARNING, low character %X maps into Unicode 0x%s\n", $i, $str);}
}
