Revision [796]

This is an old revision of HandlingUTF8 made by DreckFehler on 2004-07-29 12:44:44.

 

Real Multilanguage Support


Here's some code to provide real multilanguage support.
The first 3 functions are helpers used by the functions that do the real encoding conversions.
str2utf8, str2ascii and str2iso8859 can take any encoded string and convert it into the desired encoding: ascii plus unicode entities for html output, iso8859-1 plus unicode entities for database storage and utf8 for forms.
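Unfortunately, the ascii and iso8859 output is not compatible with htmlspecialchars, which would escape the & of every numeric entity a second time. This is the reason for the valid_xml function: it serves the same purpose as htmlspecialchars, but leaves an & that already starts an entity untouched.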
How to use this function? For instance,


And so on....

Check it out here.

The bits:
<?php
//Multilanguage support. We will use:  utf-8 for user input, iso8859-1 + unicode for database storage and ascii + unicode for printing  
// Decode a UTF-8 byte string into a list of Unicode code points (integers),
// in input order. Handles 1-, 2-, 3- and 4-byte sequences.
// No validation is performed: any byte >= 128 that arrives while no sequence
// is pending is taken as a lead byte, and a sequence still incomplete at the
// end of the string is silently dropped.
function utf8_to_unicode($str) {
    $unicode = array();
    $values = array();
    $lookingFor = 1;
    $length = strlen($str);
    for ($i = 0; $i < $length; $i++) {
        $thisValue = ord($str[$i]);
        if ($thisValue < 128) {
            // plain ASCII maps to itself
            $unicode[] = $thisValue;
            continue;
        }
        if (count($values) == 0) {
            // first byte of a sequence: its range tells how many bytes to collect
            if ($thisValue < 224) {
                $lookingFor = 2;
            } elseif ($thisValue < 240) {
                $lookingFor = 3;
            } else {
                $lookingFor = 4;
            }
        }
        $values[] = $thisValue;
        if (count($values) == $lookingFor) {
            // the modulo strips the marker bits, leaving only the payload bits
            if ($lookingFor == 4) {
                $number = (($values[0] % 8) * 262144) + (($values[1] % 64) * 4096)
                    + (($values[2] % 64) * 64) + ($values[3] % 64);
            } elseif ($lookingFor == 3) {
                $number = (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64);
            } else {
                $number = (($values[0] % 32) * 64) + ($values[1] % 64);
            }
            $unicode[] = $number;
            $values = array();
            $lookingFor = 1;
        }
    }
    return $unicode;
}
// Rewrite decimal numeric entities in the range 128-159 (where Windows-1252
// puts printable characters) into the entities of the Unicode characters they
// stand for. &#129; has no Windows-1252 character and is removed. The other
// unassigned positions (141, 143, 144, 157) are left untouched.
// Every key includes the closing ";" so that longer entities such as
// &#1280; are never partially rewritten.
function deCP1252($str) {
    static $map = array(
        "&#128;" => "&#8364;",
        "&#129;" => "",
        "&#130;" => "&#8218;",
        "&#131;" => "&#402;",
        "&#132;" => "&#8222;",
        "&#133;" => "&#8230;",
        "&#134;" => "&#8224;",
        "&#135;" => "&#8225;",
        "&#136;" => "&#710;",
        "&#137;" => "&#8240;",
        "&#138;" => "&#352;",
        "&#139;" => "&#8249;",
        "&#140;" => "&#338;",
        "&#142;" => "&#381;",
        "&#145;" => "&#8216;",
        "&#146;" => "&#8217;",
        "&#147;" => "&#8220;",
        "&#148;" => "&#8221;",
        "&#149;" => "&#8226;",
        "&#150;" => "&#8211;",
        "&#151;" => "&#8212;",
        "&#152;" => "&#732;",
        "&#153;" => "&#8482;",
        "&#154;" => "&#353;",
        "&#155;" => "&#8250;",
        "&#156;" => "&#339;",
        "&#158;" => "&#382;",
        "&#159;" => "&#376;",
    );
    // single pass: replaced text is never scanned again
    return strtr($str, $map);
}
// Encode one Unicode code point as a UTF-8 byte string.
// $num may be an integer or a numeric string (e.g. the digits captured from
// a "&#1234;" entity). Returns '' for anything that is not a valid Unicode
// scalar value: non-numeric input, negative numbers, the surrogate range
// U+D800..U+DFFF and values above U+10FFFF.
function code2utf($num){
    if (!is_numeric($num)) return '';
    $num = (int) $num;
    if ($num < 0 || $num > 0x10FFFF) return '';
    // surrogates are not characters and have no valid UTF-8 form
    if ($num >= 0xD800 && $num <= 0xDFFF) return '';
    if ($num < 128) return chr($num);
    if ($num < 2048) return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
    if ($num < 65536) {
        return chr(($num >> 12) + 224)
            . chr((($num >> 6) & 63) + 128)
            . chr(($num & 63) + 128);
    }
    return chr(($num >> 18) + 240)
        . chr((($num >> 12) & 63) + 128)
        . chr((($num >> 6) & 63) + 128)
        . chr(($num & 63) + 128);
}
//to print in a form
// Convert a string of unknown encoding to UTF-8 and turn every decimal
// numeric entity (&#NNN;) into the UTF-8 bytes of that character.
// Strings detected as UTF-8 only get their entities decoded. Anything else
// is read byte by byte as ISO-8859-1 / Windows-1252: bytes 128-159 are first
// written as entities so that deCP1252() can map them to the characters
// Windows-1252 assigns to those positions.
function str2utf8($str) {
    // Pass the candidate list directly instead of calling mb_detect_order(),
    // which would change the detection order for the rest of the request.
    $detected = mb_detect_encoding($str, array("ASCII", "UTF-8", "ISO-8859-1"), true);

    if ($detected != "UTF-8") {
        $bytes = $str;
        $str = "";
        $length = strlen($bytes);
        for ($i = 0; $i < $length; $i++) {
            $code = ord($bytes[$i]);
            if ($code >= 128 && $code < 160) {
                $str .= "&#" . $code . ";";
            } else {
                $str .= $this->code2utf($code);
            }
        }
        $str = $this->deCP1252($str);
    }

    // Require at least one digit so that a literal "&#;" is left alone.
    if (preg_match_all("/&#([0-9]+);/", $str, $unicode)) {
        $pairs = array();
        foreach ($unicode[0] as $key => $entity) {
            $pairs[$entity] = $this->code2utf($unicode[1][$key]);
        }
        // one pass over the string; decoded output is never decoded again
        $str = strtr($str, $pairs);
    }

    return $str;
}

//to print html
// Produce a pure-ASCII rendering of $str: the input is normalised to UTF-8,
// split into code points, and every code point above 127 is written as a
// decimal numeric entity. The result is passed through deCP1252().
function str2ascii ($str) {
    $codePoints = $this->utf8_to_unicode($this->str2utf8($str));

    $pieces = array();
    foreach ($codePoints as $codePoint) {
        if ($codePoint > 127) {
            $pieces[] = '&#' . $codePoint . ';';
        } else {
            $pieces[] = chr($codePoint);
        }
    }

    return $this->deCP1252(implode('', $pieces));
}
//for database storage
// Produce an ISO-8859-1 rendering of $str: the input is normalised to UTF-8
// and split into code points. Code points 0-127 and 160-255 are emitted as
// single bytes, everything else as a decimal numeric entity. The result is
// passed through deCP1252().
function str2iso8859 ($str) {
    $codePoints = $this->utf8_to_unicode($this->str2utf8($str));

    $pieces = array();
    foreach ($codePoints as $codePoint) {
        $fitsInOneByte = $codePoint <= 127 || ($codePoint > 159 && $codePoint <= 255);
        if ($fitsInOneByte) {
            $pieces[] = chr($codePoint);
        } else {
            $pieces[] = '&#' . $codePoint . ';';
        }
    }

    return $this->deCP1252(implode("", $pieces));
}
// Escape a string for XML/XHTML output without double-escaping entities.
// Double quotes, < and > are always escaped. An & is escaped only when it
// does not begin a well-formed reference: a named entity (&name;), a decimal
// reference (&#123;) or a hexadecimal reference (&#x1F;).
// Malformed look-alikes such as "&#;" or "&1a;" get their & escaped.
function valid_xml ($str) {
    $str = str_replace(
        array("\"", "<", ">"),
        array("&quot;", "&lt;", "&gt;"),
        $str
    );
    // runs last: the lookahead keeps the entities produced above intact
    $str = preg_replace(
        "/&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#[xX][0-9a-fA-F]+);)/",
        "&amp;",
        $str
    );
    return $str;
}
?>

--AndreaRossato

hmm... i may have the solution but i need to understand the problem ;)

first i don't know why not to take the utf8-decode and utf8-encode functions to handle the conversion itself (but maybe there is a reason i didn't think about). second it's not perfectly clear to me, how to treat clients that don't accept utf-8 encoding. i haven't had much time to get into the stuff, but so far i think the following tasks have to be managed:


what i don't understand yet is:


btw: what wakka-forks already exist, that are redesigned for the needs of a foreign charset? isn't wackowiki a russian spin off? do we have some cyrillic speaking wikka-fans out there? ;)

-- dreckfehler
There are 23 comments on this page. [Show comments]
Valid XHTML :: Valid CSS: :: Powered by WikkaWiki