Fork me on GitHub

AS3从未知编码的二进制流中自适应编码读取文本

原创文章,未经允许,请勿转载

给你一个文件或者一段二进制数据,在不知道它的编码是 GB2312 还是 UTF-8 的情况下,怎样才能正确读取出文本呢?

以下代码是as3的,其他编程语言只要稍微改动一下即可。

上代码,请直接用!

感谢C++ BLOG提供的判断utf8的方法:http://hi.baidu.com/xingyan126/item/4abec1c1c2143755bcef6956

/**
 * Reads text from a byte stream whose encoding is not known in advance.
 *
 * Detection order:
 *   1. If the window starts with the UTF-8 byte order mark (EF BB BF), the
 *      mark is skipped and the rest is decoded as UTF-8.
 *   2. Otherwise isUTF8() is asked for a heuristic verdict.
 *   3. If neither matches, the bytes are decoded as GB2312.
 *
 * Reading starts at ba.position and advances it past the bytes consumed.
 *
 * @param   ba   Source stream.
 * @param   len  Number of bytes to read. A negative value (default -1) or a
 *               value larger than what is left means "read to the end".
 * @return  The decoded text (without the byte order mark, if there was one).
 */
public static function readString(ba:ByteArray,len:int = -1):String
{
    // Clamp the window to the bytes that are actually available.
    if (len < 0 || len > ba.bytesAvailable) len = ba.bytesAvailable;

    // Probe for the BOM only when the requested window can contain it,
    // so that "len - 3" below can never become negative.
    if (len >= 3)
    {
        var start:uint = ba.position;
        // Indexed access yields unsigned bytes and does not move position.
        if (ba[start] == 0xEF && ba[start + 1] == 0xBB && ba[start + 2] == 0xBF)
        {
            // Skip the three BOM bytes; they are part of the window.
            ba.position = start + 3;
            return ba.readMultiByte(len - 3, 'utf-8');
        }
    }

    // No BOM: fall back to byte-pattern scoring.
    var encode:String = isUTF8(ba, len) ? 'utf-8' : 'gb2312';

    // Nothing was consumed on this path, so the full window must be read.
    return ba.readMultiByte(len, encode);
}

/**
 * Heuristically decides whether the upcoming bytes look like UTF-8 text.
 *
 * The window is scanned once. Bytes below 0x80 are counted as ASCII and
 * otherwise ignored. Bytes that form a structurally valid 2-, 3- or 4-byte
 * UTF-8 sequence (lead byte followed by the right number of 10xxxxxx
 * continuation bytes, all inside the window) are counted as "good". The
 * verdict is based on the percentage of good bytes among the non-ASCII bytes.
 *
 * The stream position is not modified.
 *
 * @param   ba   Source stream; scanning starts at ba.position.
 * @param   len  Number of bytes to inspect. A negative value (default -1) or
 *               a value larger than what is left means "inspect to the end".
 * @return  true when the non-ASCII bytes are (almost) all well-formed UTF-8;
 *          false otherwise, including when the window is empty or pure ASCII.
 */
public static function isUTF8(ba:ByteArray,len:int = -1):Boolean
{
    // Clamp the window to the bytes that are actually available.
    if (len < 0 || len > ba.bytesAvailable) len = ba.bytesAvailable;

    var goodbytes:int = 0;
    var asciibytes:int = 0;
    var pos:uint = ba.position;
    var end:uint = pos + len;
    var lead:int;
    var following:int;

    while (pos < end)
    {
        // Indexed access returns the byte as 0..255 and leaves position alone.
        lead = ba[pos];
        // Bytes left in the window after the lead byte.
        following = end - pos - 1;

        if (lead < 0x80)
        {
            // Plain ASCII: counted separately so it cannot skew the score.
            asciibytes++;
            pos += 1;
        }
        else if (lead >= 0xC0 && lead <= 0xDF
            && following >= 1
            && (ba[pos + 1] & 0xC0) == 0x80)
        {
            // 110xxxxx 10xxxxxx
            goodbytes += 2;
            pos += 2;
        }
        else if (lead >= 0xE0 && lead <= 0xEF
            && following >= 2
            && (ba[pos + 1] & 0xC0) == 0x80
            && (ba[pos + 2] & 0xC0) == 0x80)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            goodbytes += 3;
            pos += 3;
        }
        else if (lead >= 0xF0 && lead <= 0xF4
            && following >= 3
            && (ba[pos + 1] & 0xC0) == 0x80
            && (ba[pos + 2] & 0xC0) == 0x80
            && (ba[pos + 3] & 0xC0) == 0x80)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (code points above U+FFFF)
            goodbytes += 4;
            pos += 4;
        }
        else
        {
            // Non-ASCII byte that does not start a valid sequence: skip it.
            // It stays in the denominator below and lowers the score.
            pos += 1;
        }
    }

    // Empty or pure-ASCII input carries no evidence of UTF-8.
    if (asciibytes == len)
    {
        return false;
    }

    // Percentage of non-ASCII bytes that belong to valid sequences,
    // truncated to an integer by the assignment.
    var score:int = 100 * goodbytes / (len - asciibytes);

    // Require a near-perfect score to rule out coincidental matches; a
    // slightly lower score is accepted only with enough supporting bytes.
    if (score > 98)
    {
        return true;
    }
    return score > 95 && goodbytes > 30;
}

来源:悠游悠游,原文地址:https://yymmss.com/p/207.html