Thursday, January 28, 2010

Perl UTF8 to DEC

This snippet reads the XML lines held in @xmld and appends them to $jxmld, with every UTF-8 encoded character converted to a decimal numeric character reference (&#NNN;).
It is meant for systems that cannot store UTF-8 character sets.

# Walk the XML lines in @xmld, turn every non-ASCII character into a decimal
# numeric character reference (&#NNN;), and append each non-empty line to
# $jxmld.  Note that $line aliases the elements of @xmld, so the array is
# rewritten in place as well.
foreach my $line (@xmld)
{
    # utf8todec() may need more than one pass over a line.  Repeat while raw
    # high characters or "NNN_=_" tokens remain, but stop after five passes:
    # tokens that do not form a multi-byte sequence never go away on their
    # own and would otherwise keep the loop running forever.
    for my $pass (1 .. 5) {
        last unless $line =~ /[\x{80}-\x{FFFF}]/ || $line =~ /\d{3}_=_/;
        $line = utf8todec($line);
    }

    # Whatever tokens are still left stand for single characters; emit each
    # one as its own reference.  (The previous code re-declared the loop
    # variable with "my" inside the condition, so it matched against undef
    # and this conversion never happened.)
    $line =~ s/(\d{3,})_=_/&#$1;/g;

    next if $line eq "";

    # Insert a separating blank only when $jxmld already has content that
    # does not end in whitespace and $line contains no character followed by
    # whitespace.
    # NOTE(review): the /.\s/ test looks like it may have been meant as a
    # "starts with whitespace" check -- confirm before changing it.
    if ( $jxmld !~ /\s$/ && $line !~ /.\s/ && $jxmld ne "" ) {
        $jxmld .= " $line";
    }
    else {
        $jxmld .= $line;
    }
}

# utf8todec(STRING)
#
# Rewrites the non-ASCII content of STRING for storage on ASCII-only systems
# and returns the new string.
#
#   1. Every character in the range U+0080..U+FFFF is replaced by the token
#      "<ord>_=_" (decimal ordinal followed by the marker "_=_").
#   2. Runs of tokens that spell a UTF-8 byte sequence (a lead byte followed
#      by the right number of continuation bytes, 128..191) are replaced by
#      the decimal character reference "&#<code point>;".  Two-, three- and
#      four-byte sequences are recognised.
#
# Tokens that are not part of such a sequence are returned unchanged as
# "<ord>_=_"; the caller decides what to do with them.
#
# Limitation: the token format is ambiguous when a literal digit directly
# precedes a token (e.g. "8" followed by "226_=_" reads the same as the
# token "8226_=_").
sub utf8todec
{
    my ($u_st) = @_;

    # Three-digit token values for each UTF-8 byte class.
    my $u_cont  = qr/12[89]|1[3-8]\d|19[01]/;      # 128..191 continuation
    my $u_lead2 = qr/19[2-9]|2[01]\d|22[0-3]/;     # 192..223 two-byte lead
    my $u_lead3 = qr/22[4-9]|23\d/;                # 224..239 three-byte lead
    my $u_lead4 = qr/24[0-4]/;                     # 240..244 four-byte lead

    # Step 1: tokenise every high character.
    $u_st =~ s/([\x{80}-\x{FFFF}])/ord($1).'_=_'/gse;

    # Step 2: decode complete sequences.  Because each pattern insists on a
    # valid lead byte AND valid continuation bytes, the scan cannot consume a
    # lead byte as the tail of an unrelated match, and plain Latin-1 text
    # (e.g. three consecutive 233 tokens) is no longer mis-decoded.
    $u_st =~ s{($u_lead4)_=_($u_cont)_=_($u_cont)_=_($u_cont)_=_}
              {'&#' . (((($1 - 240) * 64 + ($2 - 128)) * 64 + ($3 - 128)) * 64 + ($4 - 128)) . ';'}ge;

    $u_st =~ s{($u_lead3)_=_($u_cont)_=_($u_cont)_=_}
              {'&#' . ((($1 - 224) * 64 + ($2 - 128)) * 64 + ($3 - 128)) . ';'}ge;

    $u_st =~ s{($u_lead2)_=_($u_cont)_=_}
              {'&#' . (($1 - 192) * 64 + ($2 - 128)) . ';'}ge;

    return $u_st;
}

No comments:

Post a Comment