使用libuchardet识别编码,再使用iconv转换源编码到utf-8
以下几点需要注意:
获取源的编码是重中之重,iconv本身不会有什么问题
受输入源的限制(比如输入太少、特征不够),uchardet并不总是能识别出源的编码,有时还会将其识别为超集编码(例如把GBK内容识别为GB18030)。
这是在linux下测试的,win端编译iconv应该会麻烦一点,需要MSYS环境,或者在github上找win工程版。
# Download, build and install a static libiconv 1.17 into ./install
# (relative to the unpacked source directory).
wget https://ftp.gnu.org/gnu/libiconv/libiconv-1.17.tar.gz
tar -xvf libiconv-1.17.tar.gz
cd libiconv-1.17
./configure --prefix="$(pwd)/install" --enable-static
# Keep the build output: it records where the library files were installed.
make install -j"$(nproc)" > make.log
头文件在 install 目录下。编译出的静态库路径有点奇怪,
执行 find . -name libiconv.a 应该可以帮助你找到它,或者查看 make.log。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 #include <fstream> #include <iconv.h> #include <iostream> #include <sstream> #include <stdio.h> #include <string> #include <string.h> #include <uchardet/uchardet.h> #include </home/lull/code/temp/libiconv-1.17/install/include/iconv.h> #define LOG printf static std::string require_utf8 (const char * filename) { std::ifstream in (filename) ; std::stringstream ss; in.seekg (0 , std::ios_base::end); size_t filesize = in.tellg (), outsize = 2 * filesize; in.seekg (0 , std::ios_base::beg); std::string inbuf, outbuf; inbuf.resize (filesize); outbuf.resize (outsize); auto pp =(char *)inbuf.data (); in.read (pp, filesize); auto uc_checker = uchardet_new (); int result = uchardet_handle_data (uc_checker, inbuf.data (), inbuf.size ()); uchardet_data_end (uc_checker); const char * charset = uchardet_get_charset (uc_checker); LOG ("source got %s\n" , charset); iconv_t iconver = iconv_open ("utf-8" , charset); char * _pindata = (char *)inbuf.data (); char * _poutdata = (char *)outbuf.data (); int bytes = iconv (iconver, &_pindata, &filesize, &_poutdata, &outsize); if (bytes) { return inbuf; } else { int outLen = strlen (outbuf.data ()); outbuf.resize (outLen); return outbuf; } } int main () { auto utf8str = require_utf8 ("gbk.log" ); #ifdef _WIN32 system ("chcp 65001" ); #endif LOG ("result:%s" , utf8str.c_str ()); return 0 ; }