Real Multilanguage Support


Please note that Wikka 1.4 will be released with full UTF-8 support, and is currently available for testing at http://wush.net/svn/wikka/trunk

See also:
 

Here's some code to provide real multilanguage support.
The first 3 functions are used within the functions that do the real encoding conversions.
str2utf8, str2ascii and str2iso8859 can take any encoded string and convert it into the desired encoding: ascii plus unicode entities for html output, iso8859-1 plus unicode entities for database storage and utf8 for forms.
Unfortunately the ascii and iso8859 output is not compatible with htmlspecialchars, which would escape the & of every unicode entity a second time. This is the reason for the valid_xml function. It has the same purpose as htmlspecialchars, but handles & correctly: an ampersand that already starts an entity is left untouched.
How to use these functions? For instance,


And so on....

Update (2004-08-14): I changed the functions that do the conversion to improve speed and reduce memory usage.
--AndreaRossato

Check it out here.

The bits:
<?php
//Multilanguage support. We will use:  utf-8 for user input, iso8859-1 + unicode for database storage and ascii + unicode for printing  
/**
 * Decode a UTF-8 byte string into a list of Unicode code points.
 *
 * Handles 1-, 2-, 3- and 4-byte sequences. The input is not validated:
 * the length of a sequence is taken from its first byte and the payload
 * bits of the following bytes are combined without further checks. An
 * incomplete sequence at the end of the string is silently dropped.
 *
 * @param string $str UTF-8 encoded bytes
 * @return array integer code points, in input order
 */
function utf8_to_unicode($str) {
    $unicode = array();
    $values = array();
    $lookingFor = 1;
    $length = strlen($str);
    for ($i = 0; $i < $length; $i++) {
        $thisValue = ord($str[$i]);
        if ($thisValue < 128) {
            // plain ASCII byte: it is its own code point
            $unicode[] = $thisValue;
            continue;
        }
        if (count($values) == 0) {
            // first byte of a multi-byte sequence: it decides the length
            if ($thisValue < 224) {
                $lookingFor = 2;
            } elseif ($thisValue < 240) {
                $lookingFor = 3;
            } else {
                $lookingFor = 4;
            }
        }
        $values[] = $thisValue;
        if (count($values) == $lookingFor) {
            // the modulo strips the marker bits and keeps the payload bits
            if ($lookingFor == 4) {
                $number = (($values[0] % 8) * 262144)
                    + (($values[1] % 64) * 4096)
                    + (($values[2] % 64) * 64)
                    + ($values[3] % 64);
            } elseif ($lookingFor == 3) {
                $number = (($values[0] % 16) * 4096)
                    + (($values[1] % 64) * 64)
                    + ($values[2] % 64);
            } else {
                $number = (($values[0] % 32) * 64) + ($values[1] % 64);
            }
            $unicode[] = $number;
            $values = array();
            $lookingFor = 1;
        }
    }
    return $unicode;
}
/**
 * Rewrite numeric entities in the range 128-159 as proper Unicode entities.
 *
 * Bytes 128-159 are printable characters in Windows-1252 but control codes
 * in ISO-8859-1 and Unicode, so an entity such as "&#128;" (produced by the
 * byte-to-entity conversions) has to become "&#8364;" to display as the
 * euro sign. All other text is returned unchanged.
 *
 * @param string $str text that may contain "&#128;" .. "&#159;"
 * @return string the text with those entities remapped
 */
function  deCP1252 ($str) {
    $map = array(
        "&#128;" => "&#8364;", // euro sign
        // NOTE(review): 129 is undefined in Windows-1252 and is dropped here - confirm
        "&#129;" => "",
        "&#130;" => "&#8218;", // single low-9 quotation mark
        "&#131;" => "&#402;",  // f with hook
        "&#132;" => "&#8222;", // double low-9 quotation mark
        "&#133;" => "&#8230;", // horizontal ellipsis
        "&#134;" => "&#8224;", // dagger
        "&#135;" => "&#8225;", // double dagger
        "&#136;" => "&#710;",  // modifier circumflex
        "&#137;" => "&#8240;", // per mille sign
        "&#138;" => "&#352;",  // S with caron
        "&#139;" => "&#8249;", // single left angle quotation mark
        "&#140;" => "&#338;",  // OE ligature
        "&#142;" => "&#381;",  // Z with caron
        "&#145;" => "&#8216;", // left single quotation mark
        "&#146;" => "&#8217;", // right single quotation mark
        "&#147;" => "&#8220;", // left double quotation mark
        "&#148;" => "&#8221;", // right double quotation mark
        "&#149;" => "&#8226;", // bullet
        // NOTE(review): en dash is flattened to an ASCII hyphen, not &#8211; - confirm this is intended
        "&#150;" => "-",
        "&#151;" => "&#8212;", // em dash
        "&#152;" => "&#732;",  // small tilde
        "&#153;" => "&#8482;", // trade mark sign
        "&#154;" => "&#353;",  // s with caron
        "&#155;" => "&#8250;", // single right angle quotation mark
        "&#156;" => "&#339;",  // oe ligature
        "&#158;" => "&#382;",  // z with caron
        "&#159;" => "&#376;",  // Y with diaeresis
    );
    // strtr replaces every key in a single pass
    return strtr((string) $str, $map);
}
/**
 * Encode a single code point number as a UTF-8 byte string.
 *
 * Values below 0x80 yield one byte, below 0x800 two, below 0x10000 three
 * and below 0x200000 four; anything larger yields an empty string.
 *
 * @param int|string $num code point (numeric strings are accepted)
 * @return string the UTF-8 bytes, or '' when $num is 0x200000 or more
 */
function code2utf($num){
    if ($num < 0x80) {
        return chr($num);
    }
    if ($num < 0x800) {
        return chr(0xC0 | ($num >> 6))
            . chr(0x80 | ($num & 0x3F));
    }
    if ($num < 0x10000) {
        return chr(0xE0 | ($num >> 12))
            . chr(0x80 | (($num >> 6) & 0x3F))
            . chr(0x80 | ($num & 0x3F));
    }
    if ($num < 0x200000) {
        return chr(0xF0 | ($num >> 18))
            . chr(0x80 | (($num >> 12) & 0x3F))
            . chr(0x80 | (($num >> 6) & 0x3F))
            . chr(0x80 | ($num & 0x3F));
    }
    return '';
}
//to print in a form
/**
 * Convert a string of any supported encoding to UTF-8 (used for form output).
 *
 * Input detected as UTF-8 is kept as is; anything else is read byte by
 * byte as ISO-8859-1/Windows-1252 and re-encoded. In both cases decimal
 * entities ("&#NNN;") are then expanded into real UTF-8 characters.
 *
 * Side effect: sets the global mbstring detection order.
 *
 * @param string $str text in ASCII, UTF-8 or ISO-8859-1
 * @return string UTF-8 text without decimal entities
 */
function str2utf8($str) {
    mb_detect_order("ASCII, UTF-8, ISO-8859-1");
    if (mb_detect_encoding($str) != "UTF-8") {
        $mystr = $str;
        $str = "";
        $length = strlen($mystr);
        for ($i = 0; $i < $length; $i++) {
            $code = ord($mystr[$i]);
            if ($code >= 128 && $code < 160) {
                // Windows-1252 specials: emit an entity so deCP1252 can remap it
                $str .= "&#".$code.";";
            } else {
                $str .= $this->code2utf($code);
            }
        }
        $str = $this->deCP1252($str);
    }
    // '+' instead of '*?': a bare "&#;" is not an entity and must not reach code2utf
    preg_match_all("/&#([0-9]+);/", $str, $unicode);
    foreach ($unicode[0] as $key => $value) {
        // literal replacement; the entity text is not a regular expression
        $str = str_replace($value, $this->code2utf($unicode[1][$key]), $str);
    }
    return $str;
}

//to print html
/**
 * Convert a string to plain ASCII plus decimal Unicode entities (for HTML output).
 *
 * Input detected as UTF-8 is decoded to code points and every code point
 * above 127 is written as "&#NNN;". Any other input is read byte by byte
 * and every byte above 127 is written as an entity. The result is passed
 * through deCP1252.
 *
 * Side effect: sets the global mbstring detection order.
 *
 * @param string $str text in ASCII, UTF-8 or ISO-8859-1
 * @return string 7-bit text with entities for everything above 127
 */
function str2ascii ($str) {
  mb_detect_order("ASCII, UTF-8, ISO-8859-1");
  if (mb_detect_encoding($str) == "UTF-8") {
    // expand existing entities first so the whole text is uniform UTF-8
    // ('+' keeps a bare "&#;" away from code2utf)
    preg_match_all("/&#([0-9]+);/", $str, $unicode);
    foreach ($unicode[0] as $key => $value) {
      $str = str_replace($value, $this->code2utf($unicode[1][$key]), $str);
    }
    $entities = '';
    foreach ($this->utf8_to_unicode($str) as $value) {
      $entities .= ($value > 127) ? '&#' . $value . ';' : chr($value);
    }
    return $this->deCP1252($entities);
  }

  // ASCII, ISO-8859-1 or undetected input: one byte is one character.
  // Pure ASCII passes through this loop unchanged.
  $constr = '';
  $length = strlen($str);
  for ($i = 0; $i < $length; $i++) {
    $value = ord($str[$i]);
    $constr .= ($value <= 127) ? chr($value) : '&#' . $value . ';';
  }
  return $this->deCP1252($constr);
}

//for database storage
/**
 * Convert a string to ISO-8859-1 plus decimal Unicode entities (for database storage).
 *
 * Characters 0-127 and 160-255 are stored as single bytes. Everything
 * else becomes "&#NNN;"; the 128-159 range is then remapped by deCP1252.
 *
 * Side effect: sets the global mbstring detection order.
 *
 * @param string $str text in ASCII, UTF-8 or ISO-8859-1
 * @return string ISO-8859-1 bytes with entities for all other characters
 */
function str2iso8859 ($str) {
  mb_detect_order("ASCII, UTF-8, ISO-8859-1");
  if (mb_detect_encoding($str) == "UTF-8") {
    // expand existing entities first so the whole text is uniform UTF-8
    // ('+' keeps a bare "&#;" away from code2utf)
    preg_match_all("/&#([0-9]+);/", $str, $unicode);
    foreach ($unicode[0] as $key => $value) {
      $str = str_replace($value, $this->code2utf($unicode[1][$key]), $str);
    }
    $entities = '';
    foreach ($this->utf8_to_unicode($str) as $value) {
      if ($value <= 127 || ($value > 159 && $value <= 255)) {
        $entities .= chr($value);
      } else {
        $entities .= '&#' . $value . ';';
      }
    }
    return $this->deCP1252($entities);
  }

  // ASCII, ISO-8859-1 or undetected input: one byte is one character.
  // Only the 128-159 range needs an entity so that deCP1252 can remap it;
  // every other byte is already valid ISO-8859-1.
  $constr = '';
  $length = strlen($str);
  for ($i = 0; $i < $length; $i++) {
    $value = ord($str[$i]);
    if ($value > 127 && $value < 160) {
      $constr .= '&#' . $value . ';';
    } else {
      $constr .= chr($value);
    }
  }
  return $this->deCP1252($constr);
}


/**
 * Escape text for XML/XHTML output without breaking existing entities.
 *
 * Double quotes, "<" and ">" are always escaped. An ampersand is escaped
 * only when it does not already start an entity (named or numeric), so
 * "&#233;" and "&amp;" pass through unchanged.
 *
 * @param string $str text that may already contain entities
 * @return string the escaped text
 */
function valid_xml ($str) {
    $str = str_replace("\"", "&quot;", $str);
    $str = str_replace("<", "&lt;", $str);
    $str = str_replace(">", "&gt;", $str);
    // the lookahead skips ampersands followed by "name;" or "#digits;"
    $str = preg_replace("/&(?![a-zA-Z0-9#]+?;)/", "&amp;", $str);
    return $str;
}
?>

--AndreaRossato

hmm... i may have the solution but i need to understand the problem ;)

first i don't know why not to take the utf8_decode and utf8_encode functions to handle the conversion itself (but maybe there is a reason i didn't think about). (the correct functions would have been html_entity_decode($string, ENT_QUOTES, 'UTF-8') and htmlentities($string, ENT_QUOTES, 'UTF-8'), but these functions aren't able to handle multibyte chars yet. the mb-string-lib might give a more straightforward and performant solution. andrea's sample code should be valuable to understand what happens but i am still looking for a variant that doesn't "contaminate" the code too much and keeps it maintainable. a good start might be to introduce two functions "Formstring()" and "DBstring()" which do all conversion stuff including mysql_escape_string and such and to maintain the conversion stuff in one central place in future steps)

second it's not perfectly clear to me, how to treat clients that don't accept utf-8 encoding. i haven't had much time to get into the stuff, but so far i think the following tasks have to be managed:


what i don't understand yet is:


btw: what wakka-forks already exist, that are redesigned for the needs of a foreign charset? isn't wackowiki a russian spin off? do we have some cyrillic speaking wikka-fans out there? ;)

-- dreckfehler


There's a Wakka fork redesigned to support multi-language: UniWakka -:).

I'll try to clarify the problem, as far as I can ;)
The problem with character encoding is that UTF-8 is a multi-byte encoding. ASCII and UTF-8 are actually the same for the first 128 characters, which UTF-8 encodes as one plain byte each. The problem is the remaining characters, which are encoded with more than 1 byte...

Now, there are two different approaches:
1. you can use 8-bit encoding (iso-8859-*). That is to say: if you have cyrillic characters you can use iso-8859-5 (or cp-1251, as far as I remember). Ascii characters are the same, but above chr(128) you have cyrillic chars. In this case you can use cyrillic but not, for instance, french accented letters (these are not included in iso-8859-5).
This approach lets you use charset metatags to define the encoding. PHP will be able to handle it, since the characters are plain 8-bit. This cannot be called multi-language support: you can only use a very limited set of languages at a time. Period.
This is the Wacko approach.

2. If you want to have cyrillic letters and Italian (or French) accented letters in the same wiki, then you need UTF-8, that is to say, multi-byte characters. PHP will not be able to handle strings with multi-byte encodings: preg_match and preg_replace will not work.
You need to convert those strings into single-byte characters. The only way I was able to find to manipulate those strings is to use iso-8859-1 plus unicode entities.

WikiWords must be plain ascii, as every URI.
I did not study WikkaWiki diff engine. But there shouldn't be any problem as far as you use unicode entities above ascii (or iso-8859-1) characters.
The same applies to full-text search. The string to be searched is converted into iso-8859-1 plus unicode entities. And unicode entities can be searched. Have a try here.
html_entity_decode and htmlentities work only with single-byte characters, like every PHP function. As you said, for multi-byte you need to use mb-string-lib. But if you want to use the lib you are going to rewrite every wakka-derived wiki, and you cannot use perl regular expressions. And this is not going to avoid "contamination" of the code.

Moreover, I would like to ask you to name some user agents that do not support UTF-8. IE, gecko-derived browsers, Konqueror and Opera do support it. As far as I know Google pages are utf-8 encoded.

--AndreaRossato

"Modern" user agents support UTF-8 - but as far as I know only the graphical ones (i.e., not Lynx or Links - or maybe they do on Unix, but certainly not on Windows); IE at least as far back as 5.01 - don't know about 4.0 (yes there are people that use this); Netscape 4.x has I think only limited support (if at all), and as you say the Gecko-based browsers are OK, as is Opera (6+ at least, not sure about 5).
-- JavaWoman



The "We don't like mbstring" version of the code

<?php
/**
 * Tell whether a byte string is well-formed UTF-8.
 *
 * Only sequences of 1 to 4 bytes are accepted. Overlong encodings,
 * UTF-16 surrogates (U+D800..U+DFFF), code points above U+10FFFF and
 * the obsolete 5- and 6-byte forms are rejected. Pure ASCII and the
 * empty string count as valid.
 *
 * @param string $Str bytes to check
 * @return bool true when every byte belongs to a well-formed sequence
 */
function is_utf8($Str) {
  $length = strlen($Str);
  for ($i = 0; $i < $length; $i++) {
    $lead = ord($Str[$i]);
    if ($lead < 0x80) continue;

    // $n: number of continuation bytes that must follow.
    // $min/$max: allowed range of the FIRST continuation byte; it is
    // narrowed for some lead bytes to exclude overlong forms, surrogates
    // and values above U+10FFFF.
    if ($lead >= 0xC2 && $lead <= 0xDF) {
      $n = 1;
      $min = 0x80;
      $max = 0xBF;
    } elseif ($lead >= 0xE0 && $lead <= 0xEF) {
      $n = 2;
      $min = ($lead == 0xE0) ? 0xA0 : 0x80;
      $max = ($lead == 0xED) ? 0x9F : 0xBF;
    } elseif ($lead >= 0xF0 && $lead <= 0xF4) {
      $n = 3;
      $min = ($lead == 0xF0) ? 0x90 : 0x80;
      $max = ($lead == 0xF4) ? 0x8F : 0xBF;
    } else {
      // stray continuation byte, overlong lead (C0/C1) or lead above F4
      return false;
    }

    for ($j = 0; $j < $n; $j++) {
      if (++$i == $length) return false; // sequence cut off by end of string
      $byte = ord($Str[$i]);
      if ($byte < $min || $byte > $max) return false;
      // the remaining continuation bytes use the full range
      $min = 0x80;
      $max = 0xBF;
    }
  }
  return true;
}

//to print in a form
/**
 * Convert a string to UTF-8 (used for form output), without mbstring.
 *
 * Input that is_utf8 accepts is kept as is; anything else is read byte by
 * byte as ISO-8859-1/Windows-1252 and re-encoded. In both cases decimal
 * entities ("&#NNN;") are then expanded into real UTF-8 characters.
 *
 * @param string $str text in ASCII, UTF-8 or ISO-8859-1
 * @return string UTF-8 text without decimal entities
 */
function str2utf8($str) {
  if (!$this->is_utf8($str)) {
    $mystr = $str;
    $str = "";
    $length = strlen($mystr);
    for ($i = 0; $i < $length; $i++) {
      $code = ord($mystr[$i]);
      if ($code >= 128 && $code < 160) {
        // Windows-1252 specials: emit an entity so deCP1252 can remap it
        $str .= "&#".$code.";";
      } else {
        $str .= $this->code2utf($code);
      }
    }
    $str = $this->deCP1252($str);
  }
  // '+' instead of '*?': a bare "&#;" is not an entity and must not reach code2utf
  preg_match_all("/&#([0-9]+);/", $str, $unicode);
  foreach ($unicode[0] as $key => $value) {
    // literal replacement; the entity text is not a regular expression
    $str = str_replace($value, $this->code2utf($unicode[1][$key]), $str);
  }
  return $str;
}

//ascii for xhtml
/**
 * Convert a string to plain ASCII plus decimal Unicode entities (for XHTML
 * output), without mbstring.
 *
 * Input that is_utf8 accepts is decoded to code points and every code
 * point above 127 is written as "&#NNN;". Any other input is read byte by
 * byte and every byte above 127 is written as an entity. The result is
 * passed through deCP1252.
 *
 * @param string $str text in ASCII, UTF-8 or ISO-8859-1
 * @return string 7-bit text with entities for everything above 127
 */
function str2ascii ($str) {
  if ($this->is_utf8($str)) {
    // expand existing entities first so the whole text is uniform UTF-8
    // ('+' keeps a bare "&#;" away from code2utf)
    preg_match_all("/&#([0-9]+);/", $str, $unicode);
    foreach ($unicode[0] as $key => $value) {
      $str = str_replace($value, $this->code2utf($unicode[1][$key]), $str);
    }
    $entities = '';
    foreach ($this->utf8_to_unicode($str) as $value) {
      $entities .= ($value > 127) ? '&#' . $value . ';' : chr($value);
    }
    return $this->deCP1252($entities);
  }

  // not UTF-8: one byte is one character
  $constr = '';
  $length = strlen($str);
  for ($i = 0; $i < $length; $i++) {
    $value = ord($str[$i]);
    $constr .= ($value <= 127) ? chr($value) : '&#' . $value . ';';
  }
  return $this->deCP1252($constr);
}
//iso8859 for database storage (so we do not need mysql 4.1)
/**
 * Convert a string to ISO-8859-1 plus decimal Unicode entities (for
 * database storage), without mbstring.
 *
 * Characters 0-127 and 160-255 are stored as single bytes. Everything
 * else becomes "&#NNN;"; the 128-159 range is then remapped by deCP1252.
 *
 * @param string $str text in ASCII, UTF-8 or ISO-8859-1
 * @return string ISO-8859-1 bytes with entities for all other characters
 */
function str2iso8859 ($str) {
  if ($this->is_utf8($str)) {
    // expand existing entities first so the whole text is uniform UTF-8
    // ('+' keeps a bare "&#;" away from code2utf)
    preg_match_all("/&#([0-9]+);/", $str, $unicode);
    foreach ($unicode[0] as $key => $value) {
      $str = str_replace($value, $this->code2utf($unicode[1][$key]), $str);
    }
    $entities = '';
    foreach ($this->utf8_to_unicode($str) as $value) {
      if ($value <= 127 || ($value > 159 && $value <= 255)) {
        $entities .= chr($value);
      } else {
        $entities .= '&#' . $value . ';';
      }
    }
    return $this->deCP1252($entities);
  }

  // not UTF-8: one byte is one character. Only the 128-159 range needs an
  // entity so that deCP1252 can remap it.
  $constr = '';
  $length = strlen($str);
  for ($i = 0; $i < $length; $i++) {
    $value = ord($str[$i]);
    if ($value > 127 && $value < 160) {
      $constr .= '&#' . $value . ';';
    } else {
      $constr .= chr($value);
    }
  }
  // return the converted text, not the untouched input
  return $this->deCP1252($constr);
}

--AndreaRossato



The Absolute Minimum Every Software Developer Must Know About Unicode and Character Sets



CategoryDevelopmentI18n
There are 23 comments on this page. [Show comments]
Valid XHTML :: Valid CSS: :: Powered by WikkaWiki