Revision [832]

This is an old revision of HandlingUTF8 made by DreckFehler on 2004-07-31 15:10:23.

 

Real Multilanguage Support


Here's some code to provide real multilanguage support.
The first 3 functions are used within the functions that do the real encoding conversions.
str2utf8, str2ascii and str2iso8859 can take any encoded string and convert it into the desired encoding: ascii plus unicode entities for html output, iso8859-1 plus unicode entities for database storage and utf8 for forms.
Unfortunately the ascii and iso8859 output is not compatible with htmlspecialchars, which would re-escape the & of every numeric entity. This is the reason for the valid_xml function: it serves the same purpose as htmlspecialchars, but leaves an & that already starts an entity untouched.
How to use this function? For instance,


And so on....

Check it out here.

The bits:
<?php
//Multilanguage support. We will use:  utf-8 for user input, iso8859-1 + unicode for database storage and ascii + unicode for printing  
/**
 * Decode a UTF-8 byte string into a list of Unicode code points.
 *
 * Bytes below 128 are emitted unchanged. Any other byte starts (or continues)
 * a multi-byte sequence whose length is chosen from its first byte: below 224
 * means two bytes, below 240 three bytes, anything else four bytes (needed for
 * code points above U+FFFF). Sequence bytes are not validated, and a sequence
 * left incomplete at the end of the string is dropped.
 *
 * @param string $str UTF-8 encoded bytes
 * @return array list of integer code points
 */
function utf8_to_unicode($str) {
    $unicode = array();
    $values = array();
    $lookingFor = 1;
    $length = strlen($str); // hoisted: the string does not change while looping
    for ($i = 0; $i < $length; $i++) {
        $thisValue = ord($str[$i]);
        if ($thisValue < 128) {
            $unicode[] = $thisValue;
            continue;
        }
        if (count($values) == 0) {
            // First byte of a sequence decides how many bytes belong to it.
            if ($thisValue < 224) {
                $lookingFor = 2;
            } elseif ($thisValue < 240) {
                $lookingFor = 3;
            } else {
                $lookingFor = 4;
            }
        }
        $values[] = $thisValue;
        if (count($values) == $lookingFor) {
            // Strip the marker bits with a modulo and weight each byte by 6 bits.
            if ($lookingFor == 4) {
                $number = (($values[0] % 8) * 262144) + (($values[1] % 64) * 4096)
                    + (($values[2] % 64) * 64) + ($values[3] % 64);
            } elseif ($lookingFor == 3) {
                $number = (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64);
            } else {
                $number = (($values[0] % 32) * 64) + ($values[1] % 64);
            }
            $unicode[] = $number;
            $values = array();
            $lookingFor = 1;
        }
    }
    return $unicode;
}
/**
 * Rewrite numeric entities in the Windows-1252 range (&#128; .. &#159;) to
 * the entities of the Unicode characters they stand for in that code page.
 *
 * Only complete entities (including the trailing ';') are touched, so longer
 * references such as &#1280; are left alone. &#129; has no CP1252 character
 * and is removed. Codes 141, 143, 144 and 157 are not mapped and pass through
 * unchanged.
 *
 * @param string $str text that may contain CP1252 numeric entities
 * @return string text with those entities replaced
 */
function  deCP1252 ($str) {
    $map = array(
        "&#128;" => "&#8364;",
        "&#129;" => "",
        "&#130;" => "&#8218;",
        "&#131;" => "&#402;",
        "&#132;" => "&#8222;",
        "&#133;" => "&#8230;",
        "&#134;" => "&#8224;",
        "&#135;" => "&#8225;",
        "&#136;" => "&#710;",
        "&#137;" => "&#8240;",
        "&#138;" => "&#352;",
        "&#139;" => "&#8249;",
        "&#140;" => "&#338;",
        "&#142;" => "&#381;",
        "&#145;" => "&#8216;",
        "&#146;" => "&#8217;",
        "&#147;" => "&#8220;",
        "&#148;" => "&#8221;",
        "&#149;" => "&#8226;",
        "&#150;" => "&#8211;",
        "&#151;" => "&#8212;",
        "&#152;" => "&#732;",
        "&#153;" => "&#8482;",
        "&#154;" => "&#353;",
        "&#155;" => "&#8250;",
        "&#156;" => "&#339;",
        "&#158;" => "&#382;",
        "&#159;" => "&#376;",
    );
    // strtr replaces every key in one pass over the input.
    return strtr($str, $map);
}
/**
 * Encode a single code point number as a UTF-8 byte sequence.
 *
 * Values below 128 become one byte, below 2048 two bytes, below 65536 three
 * bytes and below 2097152 four bytes. Anything larger yields an empty string.
 *
 * @param int $num code point number
 * @return string the encoded bytes, or '' when $num is 2097152 or more
 */
function code2utf($num){
    if ($num < 128) {
        return chr($num);
    }
    if ($num < 2048) {
        $lead = ($num >> 6) + 192;
        $last = ($num & 63) + 128;
        return chr($lead) . chr($last);
    }
    if ($num < 65536) {
        $lead = ($num >> 12) + 224;
        $mid  = (($num >> 6) & 63) + 128;
        $last = ($num & 63) + 128;
        return chr($lead) . chr($mid) . chr($last);
    }
    if ($num < 2097152) {
        $lead   = ($num >> 18) + 240;
        $second = (($num >> 12) & 63) + 128;
        $third  = (($num >> 6) & 63) + 128;
        $last   = ($num & 63) + 128;
        return chr($lead) . chr($second) . chr($third) . chr($last);
    }
    return '';
}
//to print in a form
/**
 * Convert a string to UTF-8 and turn its decimal numeric entities (&#NNN;)
 * into the corresponding UTF-8 characters.
 *
 * The encoding is detected with mb_detect_encoding() using the order
 * ASCII, UTF-8, ISO-8859-1 (set globally through mb_detect_order()). When the
 * input is not detected as UTF-8 it is converted byte by byte: bytes 128-159
 * become numeric entities that deCP1252() then maps, every other byte is
 * encoded with code2utf().
 *
 * Entities are decoded in one pass over the string, so text produced by one
 * replacement is never decoded a second time, and "&#;" (no digits) is left
 * as it is.
 *
 * @param string $str input in ASCII, UTF-8 or a single-byte encoding
 * @return string UTF-8 text with numeric entities resolved
 */
function str2utf8($str) {
    mb_detect_order("ASCII, UTF-8, ISO-8859-1");
    if (mb_detect_encoding($str) != "UTF-8") {
        $converted = "";
        $length = strlen($str);
        for ($i = 0; $i < $length; $i++) {
            $code = ord($str[$i]);
            if ($code >= 128 && $code < 160) {
                // Leave this range as an entity so deCP1252() can map it.
                $converted .= "&#" . $code . ";";
            } else {
                $converted .= $this->code2utf($code);
            }
        }
        $str = $this->deCP1252($converted);
    }

    // With DELIM_CAPTURE the result alternates: literal text at even
    // indexes, captured entity digits at odd indexes.
    $pieces = preg_split("/&#([0-9]+);/", $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($pieces === false) {
        return $str;
    }
    $result = "";
    foreach ($pieces as $index => $piece) {
        if ($index % 2 == 1) {
            $result .= $this->code2utf((int) $piece);
        } else {
            $result .= $piece;
        }
    }
    return $result;
}

//to print html
/**
 * Produce an ASCII-only version of a string for HTML output.
 *
 * The input is normalised with str2utf8(), split into code points with
 * utf8_to_unicode(), and every code point above 127 is written as a decimal
 * numeric entity. The result is finally passed through deCP1252().
 *
 * @param string $str input text
 * @return string ASCII text with numeric entities
 */
function str2ascii ($str) {
    $codepoints = $this->utf8_to_unicode($this->str2utf8($str));
    $html = '';
    foreach ($codepoints as $codepoint) {
        if ($codepoint > 127) {
            $html .= '&#' . $codepoint . ';';
        } else {
            $html .= chr($codepoint);
        }
    }
    return $this->deCP1252($html);
}
//for database storage
/**
 * Produce an ISO-8859-1 version of a string for database storage.
 *
 * The input is normalised with str2utf8() and split into code points with
 * utf8_to_unicode(). Code points up to 127 and those from 160 to 255 are
 * written as single bytes; all others become decimal numeric entities. The
 * result is finally passed through deCP1252().
 *
 * @param string $str input text
 * @return string single-byte text with numeric entities for everything else
 */
function str2iso8859 ($str) {
    $codepoints = $this->utf8_to_unicode($this->str2utf8($str));
    $stored = "";
    foreach ($codepoints as $codepoint) {
        $fitsInOneByte = ($codepoint <= 127) || ($codepoint > 159 && $codepoint <= 255);
        if ($fitsInOneByte) {
            $stored .= chr($codepoint);
        } else {
            $stored .= '&#' . $codepoint . ';';
        }
    }
    return $this->deCP1252($stored);
}
/**
 * Escape a string for XML/HTML output without breaking existing entities.
 *
 * Double quotes and angle brackets are replaced by &quot;, &lt; and &gt;.
 * An ampersand is replaced by &amp; only when it is not already followed by
 * letters, digits or '#' and a terminating ';'.
 *
 * @param string $str text to escape
 * @return string escaped text
 */
function valid_xml ($str) {
    $escaped = str_replace(
        array("\"", "<", ">"),
        array("&quot;", "&lt;", "&gt;"),
        $str
    );
    return preg_replace("/&(?![a-zA-Z0-9#]+?;)/", "&amp;", $escaped);
}
?>

--AndreaRossato

hmm... i may have the solution but i need to understand the problem ;)

first i don't know why not to take the utf8_decode and utf8_encode functions to handle the conversion itself (but maybe there is a reason i didn't think about). the correct functions would have been html_entity_decode($string, ENT_QUOTES, 'UTF-8') and htmlentities($string, ENT_QUOTES, 'UTF-8'), but these functions aren't able to handle multibyte chars yet. the mb-string lib might give a more straightforward and performant solution. andrea's sample code should be valuable to understand what happens, but i am still looking for a variant that doesn't "contaminate" the code too much and keeps it maintainable. a good start might be to introduce two functions "Formstring()" and "DBstring()" which do all the conversion stuff, including mysql_escape_string and such, and to maintain the conversion stuff in one central place in future steps.

second it's not perfectly clear to me, how to treat clients that don't accept utf-8 encoding. i haven't had much time to get into the stuff, but so far i think the following tasks have to be managed:


what i don't understand yet is:


btw: what wakka-forks already exist, that are redesigned for the needs of a foreign charset? isn't wackowiki a russian spin off? do we have some cyrillic speaking wikka-fans out there? ;)

-- dreckfehler
There are 23 comments on this page. [Show comments]
Valid XHTML :: Valid CSS: :: Powered by WikkaWiki