Saturday, May 22, 2004

 

Some notes on WideChar and UNICODE in VC

1.
WideCharToMultiByte
http://msdn.microsoft.com/library/default.asp?url=/library/en-us/intl/unicode_2bj9.asp
非常简单的解决了我遇到的中文乱码问题。

另外
TEXT vs. _TEXT vs. _T, and UNICODE vs. _UNICODE 的麻烦真的很麻烦!
看看
http://weblogs.asp.net/oldnewthing/archive/2004/02/12/71851.aspx
后面的评论就能很清楚的了解这一点。
感谢ForNormandy朋友做了以下这么个辛苦的比较工作。

by ForNormandy
from http://dev.csdn.net/article/33/33391.shtm

// Outputting Chinese text
// Each call below formats the same two-character Chinese string with a
// different combination of function (sprintf / swprintf / wsprintfA /
// wsprintfW), format specifier (%s / %S) and argument width (narrow / wide).
// The trailing comment on each line is the result reported by the author.

char szA[8];
WCHAR szW[8];

sprintf(szA, "%s", L"和平"); // garbled, four bytes
sprintf(szA, "%s", "和平"); // correct output

sprintf(szA, "%S", L"和平"); // zero bytes
sprintf(szA, "%S", "和平"); // zero bytes

swprintf(szW, L"%s", L"和平"); // correct output, four bytes
swprintf(szW, L"%s", "和平"); // cannot be displayed, four bytes, content is ANSI-encoded

swprintf(szW, L"%S", L"和平"); // cannot be displayed, eight bytes, content is Unicode-encoded
swprintf(szW, L"%S", "和平"); // cannot be displayed, eight bytes, content is ANSI-encoded


wsprintfA(szA, "%s", L"和平"); // garbled, four bytes
wsprintfA(szA, "%s", "和平"); // correct output

wsprintfA(szA, "%S", L"和平"); // correct output
wsprintfA(szA, "%S", "和平"); // garbled, two bytes

wsprintfW(szW, L"%s", L"和平"); // correct output, four bytes
wsprintfW(szW, L"%s", "和平"); // cannot be displayed, four bytes, content is ANSI-encoded

wsprintfW(szW, L"%S", L"和平"); // cannot be displayed, six bytes, content is Unicode-encoded
wsprintfW(szW, L"%S", "和平"); // correct output, eight bytes

// Outputting English text
// Each call below formats the string "well" with a different combination of
// function (sprintf / swprintf / wsprintfA / wsprintfW), format specifier
// (%s / %S) and argument width (narrow / wide). The trailing comment on each
// line is the result reported by the author.

char szA[8];
WCHAR szW[8];

sprintf(szA, "%s", L"well"); // "w", one byte
sprintf(szA, "%s", "well"); // "well", four bytes

sprintf(szA, "%S", L"well"); // "well", four bytes
sprintf(szA, "%S", "well"); // zero bytes

swprintf(szW, L"%s", L"well"); // "well", eight bytes
swprintf(szW, L"%s", "well"); // garbled, four bytes

swprintf(szW, L"%S", L"well"); // "w", two bytes
swprintf(szW, L"%S", "well"); // "well", eight bytes


wsprintfA(szA, "%s", L"well"); // "w", one byte
wsprintfA(szA, "%s", "well"); // "well", four bytes

wsprintfA(szA, "%S", L"well"); // "well", four bytes
wsprintfA(szA, "%S", "well"); // garbled, four bytes

wsprintfW(szW, L"%s", L"well"); // "well", eight bytes
wsprintfW(szW, L"%s", "well"); // garbled, four bytes, content is ANSI-encoded

wsprintfW(szW, L"%S", L"well"); // "w", two bytes
wsprintfW(szW, L"%S", "well"); // "well", eight bytes

2.
Unicode Home Page
http://www.unicode.org/

3.
International Components for Unicode
http://oss.software.ibm.com/icu/

ICU is a mature, widely used set of C/C++ and Java libraries for Unicode support, software internationalization and globalization (i18n/g11n). It grew out of the JDK 1.1 internationalization APIs, which the ICU team contributed, and the project continues to be developed for the most advanced Unicode/i18n support. ICU is widely portable and gives applications the same results on all platforms and between C/C++ and Java software.

4.
UTF-8 and Unicode FAQ for Unix/Linux
by Markus Kuhn
http://www.cl.cam.ac.uk/~mgk25/unicode.html

5.
Unicode, MBCS and Generic text mappings
By Chris Maunder
http://www.codeproject.com/cpp/unicode.asp
老大的作品,信誉保证

6.
/*#############################################################################
# TCONVERT.H
#
# SCA Software International S.A.
# http://www.scasoftware.com
# scaadmin@scasoftware.com
#
# Copyright (c) 2000 SCA Software International S.A.
#
# Date: 01.05.2000
# Author: Zoran M.Todorovic
#
# This software is provided "AS IS", without a warranty of any kind.
# You are free to use/modify this code but leave this header intact.
#
#############################################################################*/

#ifndef __TCONVERT_H__
#define __TCONVERT_H__

#ifndef _INC_TCHAR
# include <tchar.h>
#endif
#ifndef _INC_CRTDBG
# include <crtdbg.h>
#endif
#ifndef _WINDOWS_
# include <windows.h>
#endif
#include <stdlib.h>  // wcstombs, mbstowcs
#include <string.h>  // strlen, strcpy
#include <wchar.h>   // wcslen, wcscpy

//=============================================================================
// class _tochar
// This class converts either WCHAR or CHAR string to a new CHAR string.
// Memory is allocated/deallocated using new/delete
//=============================================================================

// Holds a freshly allocated (new[]) CHAR copy of a CHAR or WCHAR string and
// exposes it through implicit conversions to LPSTR / LPCSTR.
//
// bAutoDelete == TRUE  : the destructor releases the buffer with delete[].
// bAutoDelete == FALSE : the destructor leaves the buffer alone, so whoever
//                        obtained the pointer has to delete[] it.
//
// NOTE: no copy constructor or assignment operator is defined here, so a
// copied auto-deleting instance would delete[] the same buffer twice.
class _tochar {
private:
    BOOL  m_bAutoDelete;   // whether the destructor frees m_szBuffer
    LPSTR m_szBuffer;      // NUL-terminated CHAR string allocated with new[]

public:
    // Converts a wide string to a multibyte string with wcstombs.
    // If the text contains a character wcstombs cannot convert, the result
    // is an empty string.
    _tochar(LPCWSTR wszText, BOOL bAutoDelete = TRUE)
    {
        m_bAutoDelete = bAutoDelete;
        _ASSERTE(wszText);
        // A single wide character can expand to more than one byte, so the
        // wide length is not a safe buffer size. Ask wcstombs for the byte
        // count first.
        size_t nBytes = wcstombs(NULL, wszText, 0);
        if (nBytes == (size_t)-1)
            nBytes = 0;                 // conversion failed
        m_szBuffer = new CHAR [nBytes + 1];
        if (nBytes > 0)
            wcstombs(m_szBuffer, wszText, nBytes);
        m_szBuffer[nBytes] = '\0';      // wcstombs does not terminate when the count is exhausted
    }

    // Duplicates a narrow string.
    _tochar(LPCSTR szText, BOOL bAutoDelete = TRUE)
    {
        m_bAutoDelete = bAutoDelete;
        _ASSERTE(szText);
        size_t nLen = strlen(szText) + 1;   // include the terminator
        m_szBuffer = new CHAR [nLen];
        strcpy(m_szBuffer, szText);
    }

    // Frees the buffer only when auto-delete was requested.
    ~_tochar()
    {
        if (m_bAutoDelete) {
            _ASSERTE(m_szBuffer);
            delete [] m_szBuffer;
        }
    }

    // Returns the internal buffer as a writable string.
    operator LPSTR()
    {
        _ASSERTE(m_szBuffer);
        return (LPSTR)m_szBuffer;
    }

    // Returns the internal buffer as a read-only string.
    operator LPCSTR()
    {
        _ASSERTE(m_szBuffer);
        return (LPCSTR)m_szBuffer;
    }
};

//=============================================================================
// class _towchar
// This class converts either WCHAR or CHAR string to a new WCHAR string.
// Memory is allocated/deallocated using new/delete
//=============================================================================

// Holds a freshly allocated (new[]) WCHAR copy of a CHAR or WCHAR string and
// exposes it through implicit conversions to LPWSTR / LPCWSTR.
//
// bAutoDelete == TRUE  : the destructor releases the buffer with delete[].
// bAutoDelete == FALSE : the destructor leaves the buffer alone, so whoever
//                        obtained the pointer has to delete[] it.
//
// NOTE: no copy constructor or assignment operator is defined here, so a
// copied auto-deleting instance would delete[] the same buffer twice.
class _towchar {
private:
    BOOL   m_bAutoDelete;   // whether the destructor frees m_wszBuffer
    LPWSTR m_wszBuffer;     // NUL-terminated WCHAR string allocated with new[]

public:
    // Duplicates a wide string.
    _towchar(LPCWSTR wszText, BOOL bAutoDelete = TRUE)
    {
        m_bAutoDelete = bAutoDelete;
        _ASSERTE(wszText);
        size_t nLen = wcslen(wszText) + 1;   // include the terminator
        m_wszBuffer = new WCHAR [nLen];
        wcscpy(m_wszBuffer, wszText);
    }

    // Converts a multibyte string to a wide string with mbstowcs.
    // If the text contains an invalid multibyte sequence, the result is an
    // empty string.
    _towchar(LPCSTR szText, BOOL bAutoDelete = TRUE)
    {
        m_bAutoDelete = bAutoDelete;
        _ASSERTE(szText);
        // The wide form never has more characters than the source has bytes,
        // so strlen + 1 elements are always enough.
        size_t nMax = strlen(szText) + 1;
        m_wszBuffer = new WCHAR [nMax];
        size_t nChars = mbstowcs(m_wszBuffer, szText, nMax);
        if (nChars >= nMax)          // also catches the (size_t)-1 failure value
            nChars = 0;
        m_wszBuffer[nChars] = L'\0'; // guarantee termination, even after a failure
    }

    // Frees the buffer only when auto-delete was requested.
    ~_towchar()
    {
        if (m_bAutoDelete) {
            _ASSERTE(m_wszBuffer);
            delete [] m_wszBuffer;
        }
    }

    // Returns the internal buffer as a writable wide string.
    operator LPWSTR()
    {
        _ASSERTE(m_wszBuffer);
        return (LPWSTR)m_wszBuffer;
    }

    // Returns the internal buffer as a read-only wide string.
    operator LPCWSTR()
    {
        _ASSERTE(m_wszBuffer);
        return (LPCWSTR)m_wszBuffer;
    }
};

//=============================================================================
// class _totchar
// This class converts either WCHAR or CHAR string to a new TCHAR string.
// Memory is allocated/deallocated using new/delete
//=============================================================================

// Holds a freshly allocated (new[]) TCHAR copy of a CHAR or WCHAR string and
// exposes it through implicit conversions to LPTSTR / LPCTSTR. Whether a
// conversion or a plain copy happens depends on UNICODE / _UNICODE.
//
// bAutoDelete == TRUE  : the destructor releases the buffer with delete[].
// bAutoDelete == FALSE : the destructor leaves the buffer alone, so whoever
//                        obtained the pointer has to delete[] it.
//
// NOTE: no copy constructor or assignment operator is defined here, so a
// copied auto-deleting instance would delete[] the same buffer twice.
class _totchar {
private:
    BOOL   m_bAutoDelete;   // whether the destructor frees m_tszBuffer
    LPTSTR m_tszBuffer;     // NUL-terminated TCHAR string allocated with new[]

public:
    // Builds the TCHAR string from a narrow string.
    // Unicode build: converts with mbstowcs (empty string on an invalid
    // multibyte sequence). ANSI build: plain copy.
    _totchar(LPCSTR szText, BOOL bAutoDelete = TRUE)
    {
        m_bAutoDelete = bAutoDelete;
        _ASSERTE(szText);
        // Enough for both branches: the wide form never has more characters
        // than the source has bytes.
        size_t nMax = strlen(szText) + 1;
        m_tszBuffer = new TCHAR [nMax];
#if defined(UNICODE) || defined(_UNICODE)
        size_t nChars = mbstowcs(m_tszBuffer, szText, nMax);
        if (nChars >= nMax)          // also catches the (size_t)-1 failure value
            nChars = 0;
        m_tszBuffer[nChars] = 0;     // guarantee termination, even after a failure
#else
        strcpy(m_tszBuffer, szText);
#endif
    }

    // Builds the TCHAR string from a wide string.
    // Unicode build: plain copy. ANSI build: converts with wcstombs (empty
    // string if a character cannot be converted).
    _totchar(LPCWSTR wszText, BOOL bAutoDelete = TRUE)
    {
        m_bAutoDelete = bAutoDelete;
        _ASSERTE(wszText);
#if defined(UNICODE) || defined(_UNICODE)
        m_tszBuffer = new TCHAR [wcslen(wszText) + 1];
        wcscpy(m_tszBuffer, wszText);
#else
        // A single wide character can expand to more than one byte, so the
        // wide length is not a safe buffer size. Ask wcstombs for the byte
        // count first.
        size_t nBytes = wcstombs(NULL, wszText, 0);
        if (nBytes == (size_t)-1)
            nBytes = 0;                  // conversion failed
        m_tszBuffer = new TCHAR [nBytes + 1];
        if (nBytes > 0)
            wcstombs(m_tszBuffer, wszText, nBytes);
        m_tszBuffer[nBytes] = '\0';      // wcstombs does not terminate when the count is exhausted
#endif
    }

    // Frees the buffer only when auto-delete was requested.
    ~_totchar()
    {
        if (m_bAutoDelete) {
            _ASSERTE(m_tszBuffer);
            delete [] m_tszBuffer;
        }
    }

    // Returns the internal buffer as a writable TCHAR string.
    operator LPTSTR()
    {
        _ASSERTE(m_tszBuffer);
        return (LPTSTR) m_tszBuffer;
    }

    // Returns the internal buffer as a read-only TCHAR string.
    operator LPCTSTR()
    {
        _ASSERTE(m_tszBuffer);
        return (LPCTSTR) m_tszBuffer;
    }
};

//=============================================================================
// class _cochar
// This class converts either WCHAR or CHAR string to a new CHAR string.
// Memory is allocated/deallocated using CoTaskMemAlloc/CoTaskMemFree.
//=============================================================================

// Holds a CHAR copy of a CHAR or WCHAR string in memory obtained from
// CoTaskMemAlloc and exposes it through implicit conversions to
// LPSTR / LPCSTR.
//
// bAutoDelete == TRUE  : the destructor releases the buffer with CoTaskMemFree.
// bAutoDelete == FALSE : the destructor leaves the buffer alone, so whoever
//                        obtained the pointer has to CoTaskMemFree it.
//
// If CoTaskMemAlloc fails the buffer pointer is NULL and the conversion
// operators return NULL.
//
// NOTE: no copy constructor or assignment operator is defined here, so a
// copied auto-deleting instance would free the same buffer twice.
class _cochar {
private:
    BOOL  m_bAutoDelete;   // whether the destructor frees m_szBuffer
    LPSTR m_szBuffer;      // NUL-terminated CHAR string, or NULL if allocation failed

public:
    // Converts a wide string to a multibyte string with wcstombs.
    // If the text contains a character wcstombs cannot convert, the result
    // is an empty string.
    _cochar(LPCWSTR wszText, BOOL bAutoDelete = TRUE)
    {
        m_bAutoDelete = bAutoDelete;
        _ASSERTE(wszText);
        // A single wide character can expand to more than one byte, so the
        // wide length is not a safe buffer size. Ask wcstombs for the byte
        // count first.
        size_t nBytes = wcstombs(NULL, wszText, 0);
        if (nBytes == (size_t)-1)
            nBytes = 0;                     // conversion failed
        m_szBuffer = (LPSTR)::CoTaskMemAlloc((nBytes + 1) * sizeof(CHAR));
        if (m_szBuffer) {
            if (nBytes > 0)
                wcstombs(m_szBuffer, wszText, nBytes);
            m_szBuffer[nBytes] = '\0';      // wcstombs does not terminate when the count is exhausted
        }
    }

    // Duplicates a narrow string.
    _cochar(LPCSTR szText, BOOL bAutoDelete = TRUE)
    {
        m_bAutoDelete = bAutoDelete;
        _ASSERTE(szText);
        size_t nLen = strlen(szText) + 1;   // include the terminator
        m_szBuffer = (LPSTR)::CoTaskMemAlloc(nLen * sizeof(CHAR));
        if (m_szBuffer)                     // do not copy into a failed allocation
            strcpy(m_szBuffer, szText);
    }

    // Frees the buffer only when auto-delete was requested.
    ~_cochar()
    {
        if (m_bAutoDelete)
            ::CoTaskMemFree(m_szBuffer);
    }

    // Returns the internal buffer as a writable string.
    operator LPSTR()
    {
        return (LPSTR)m_szBuffer;
    }

    // Returns the internal buffer as a read-only string.
    operator LPCSTR()
    {
        return (LPCSTR)m_szBuffer;
    }
};

//=============================================================================
// class _cowchar
// This class converts either WCHAR or CHAR string to a new WCHAR string.
// Memory is allocated/deallocated using CoTaskMemAlloc/CoTaskMemFree
//=============================================================================

// Holds a WCHAR copy of a CHAR or WCHAR string in memory obtained from
// CoTaskMemAlloc and exposes it through implicit conversions to
// LPWSTR / LPCWSTR.
//
// bAutoDelete == TRUE  : the destructor releases the buffer with CoTaskMemFree.
// bAutoDelete == FALSE : the destructor leaves the buffer alone, so whoever
//                        obtained the pointer has to CoTaskMemFree it.
//
// If CoTaskMemAlloc fails the buffer pointer is NULL and the conversion
// operators return NULL.
//
// NOTE: no copy constructor or assignment operator is defined here, so a
// copied auto-deleting instance would free the same buffer twice.
class _cowchar {
private:
    BOOL   m_bAutoDelete;   // whether the destructor frees m_wszBuffer
    LPWSTR m_wszBuffer;     // NUL-terminated WCHAR string, or NULL if allocation failed

public:
    // Duplicates a wide string.
    _cowchar(LPCWSTR wszText, BOOL bAutoDelete = TRUE)
    {
        m_bAutoDelete = bAutoDelete;
        _ASSERTE(wszText);
        size_t nLen = wcslen(wszText) + 1;   // include the terminator
        m_wszBuffer = (LPWSTR)::CoTaskMemAlloc(nLen * sizeof(WCHAR));
        if (m_wszBuffer)                     // do not copy into a failed allocation
            wcscpy(m_wszBuffer, wszText);
    }

    // Converts a multibyte string to a wide string with mbstowcs.
    // If the text contains an invalid multibyte sequence, the result is an
    // empty string.
    _cowchar(LPCSTR szText, BOOL bAutoDelete = TRUE)
    {
        m_bAutoDelete = bAutoDelete;
        _ASSERTE(szText);
        // The wide form never has more characters than the source has bytes,
        // so strlen + 1 elements are always enough.
        size_t nMax = strlen(szText) + 1;
        m_wszBuffer = (LPWSTR)::CoTaskMemAlloc(nMax * sizeof (WCHAR));
        if (m_wszBuffer) {
            size_t nChars = mbstowcs(m_wszBuffer, szText, nMax);
            if (nChars >= nMax)          // also catches the (size_t)-1 failure value
                nChars = 0;
            m_wszBuffer[nChars] = L'\0'; // guarantee termination, even after a failure
        }
    }

    // Frees the buffer only when auto-delete was requested.
    ~_cowchar()
    {
        if (m_bAutoDelete)
            ::CoTaskMemFree(m_wszBuffer);
    }

    // Returns the internal buffer as a writable wide string.
    operator LPWSTR()
    {
        return (LPWSTR)m_wszBuffer;
    }

    // Returns the internal buffer as a read-only wide string.
    operator LPCWSTR()
    {
        return (LPCWSTR)m_wszBuffer;
    }
};

//=============================================================================
// class _cotchar
// This class converts either WCHAR or CHAR string to a new TCHAR string.
// Memory is allocated/deallocated using CoTaskMemAlloc/CoTaskMemFree
//=============================================================================

// Holds a TCHAR copy of a CHAR or WCHAR string in memory obtained from
// CoTaskMemAlloc and exposes it through implicit conversions to
// LPTSTR / LPCTSTR. Whether a conversion or a plain copy happens depends on
// UNICODE / _UNICODE.
//
// bAutoDelete == TRUE  : the destructor releases the buffer with CoTaskMemFree.
// bAutoDelete == FALSE : the destructor leaves the buffer alone, so whoever
//                        obtained the pointer has to CoTaskMemFree it.
//
// If CoTaskMemAlloc fails the buffer pointer is NULL and the conversion
// operators return NULL.
//
// NOTE: no copy constructor or assignment operator is defined here, so a
// copied auto-deleting instance would free the same buffer twice.
class _cotchar {
private:
    BOOL   m_bAutoDelete;   // whether the destructor frees m_tszBuffer
    LPTSTR m_tszBuffer;     // NUL-terminated TCHAR string, or NULL if allocation failed

public:
    // Builds the TCHAR string from a narrow string.
    // Unicode build: converts with mbstowcs (empty string on an invalid
    // multibyte sequence). ANSI build: plain copy.
    _cotchar(LPCSTR szText, BOOL bAutoDelete = TRUE)
    {
        m_bAutoDelete = bAutoDelete;
        _ASSERTE(szText);
        // Enough for both branches: the wide form never has more characters
        // than the source has bytes.
        size_t nMax = strlen(szText) + 1;
        m_tszBuffer = (LPTSTR)::CoTaskMemAlloc(nMax * sizeof(TCHAR));
        if (m_tszBuffer) {
#if defined(UNICODE) || defined(_UNICODE)
            size_t nChars = mbstowcs(m_tszBuffer, szText, nMax);
            if (nChars >= nMax)          // also catches the (size_t)-1 failure value
                nChars = 0;
            m_tszBuffer[nChars] = 0;     // guarantee termination, even after a failure
#else
            strcpy(m_tszBuffer, szText);
#endif
        }
    }

    // Builds the TCHAR string from a wide string.
    // Unicode build: plain copy. ANSI build: converts with wcstombs (empty
    // string if a character cannot be converted).
    _cotchar(LPCWSTR wszText, BOOL bAutoDelete = TRUE)
    {
        m_bAutoDelete = bAutoDelete;
        _ASSERTE(wszText);
#if defined(UNICODE) || defined(_UNICODE)
        m_tszBuffer = (LPTSTR)::CoTaskMemAlloc((wcslen(wszText) + 1) * sizeof(TCHAR));
        if (m_tszBuffer)                 // do not copy into a failed allocation
            wcscpy(m_tszBuffer, wszText);
#else
        // A single wide character can expand to more than one byte, so the
        // wide length is not a safe buffer size. Ask wcstombs for the byte
        // count first.
        size_t nBytes = wcstombs(NULL, wszText, 0);
        if (nBytes == (size_t)-1)
            nBytes = 0;                  // conversion failed
        m_tszBuffer = (LPTSTR)::CoTaskMemAlloc((nBytes + 1) * sizeof(TCHAR));
        if (m_tszBuffer) {
            if (nBytes > 0)
                wcstombs(m_tszBuffer, wszText, nBytes);
            m_tszBuffer[nBytes] = '\0';  // wcstombs does not terminate when the count is exhausted
        }
#endif
    }

    // Frees the buffer only when auto-delete was requested.
    ~_cotchar()
    {
        if (m_bAutoDelete)
            ::CoTaskMemFree(m_tszBuffer);
    }

    // Returns the internal buffer as a writable TCHAR string.
    operator LPTSTR()
    {
        return (LPTSTR) m_tszBuffer;
    }

    // Returns the internal buffer as a read-only TCHAR string.
    operator LPCTSTR()
    {
        return (LPCTSTR) m_tszBuffer;
    }
};

#endif

/*#############################################################################
# End of file
#############################################################################*/

7.
弥补Reflector对中文支持的不足
from http://www.cnblogs.com/birdshome/archive/2004/08/31/37926.aspx

Reflector是我目前用的最多的反编译工具,他可能不是最好的,但以它的更新速度,我相信他会成为最好的哦~~
国外的软件,对中文支持还是有问题,虽然我们都Unicode了,可是老外还是不一定认账。Reflector反编译代码,Unicode中非英文的字符都显示为了\u????,这本来还是对的,可是我们看起来就太郁闷了。Reflector支持Plug-in,曾经想写个插件,结果要了解的东西太多,就放弃了。不过问题也解决了,用VS.NET带的宏,haha

VS.NET 宏代码,名称:Unicode2Character


Imports EnvDTE
Imports System.Globalization
Imports System.Text.RegularExpressions
Imports System.Diagnostics

Public Module Birdshome

    ' Replaces \uXXXX escapes (four hex digits, case-insensitive) in the active
    ' document with the character whose code is XXXX.
    '
    ' A working copy of the document text is used to find the next escape.
    ' Each escape found is removed from that copy and replaced in the document
    ' via TextDocument.ReplacePattern. The loop ends when no escape is left.
    Sub Unicode2Character()
        Dim doc As Document = DTE.ActiveDocument
        Dim docText As TextDocument = doc.Object
        Dim selText As TextSelection = docText.Selection()
        selText.SelectAll()
        Dim text As String = selText.Text
        Dim iLength As Integer
        ' "\\u" matches a literal backslash followed by "u"; a bare "\u" would
        ' be read by the regex engine as the start of a Unicode escape.
        Dim strPattern As String = "(?<code>\\u[A-F0-9]{4})"
        Do
            iLength = text.Length
            Dim m As Match = Regex.Match(text, strPattern, RegexOptions.IgnoreCase)
            If Not m.Success Then
                Exit Do
            End If

            Dim strValue As String = m.Groups("code").Value
            ' Drop every occurrence of this escape from the working copy so it
            ' is not matched again.
            text = text.Replace(strValue, "")
            ' Skip the leading "\u" and parse the four hex digits.
            Dim codePoint As Integer
            codePoint = System.Int32.Parse(strValue.Substring(2, 4), NumberStyles.HexNumber)
            Dim ch As Char = ChrW(codePoint)
            docText.ReplacePattern(strValue, ch)

            ' Safety net: stop if the working copy did not shrink.
            If Not text.Length < iLength Then
                Exit Do
            End If
        Loop
        selText.StartOfDocument()
    End Sub
End Module

8.
如何判断一个文件是否为UTF-8编码
By sabbanji

  如果你要从一堆数据(假定无头部信息)来分辨是否符合某一格式(给出一个充要判断),唯一的办法是解码一遍。

  UTF-8的格式为:

  一字节:[0x00–0x7F]
  二字节:[0xC0–0xDF] [0x80–0xBF]
  三字节:[0xE0–0xEF] [0x80–0xBF] [0x80–0xBF]
  四字节:[0xF0–0xF7] [0x80–0xBF] [0x80–0xBF] [0x80–0xBF]
  非法字节:[0xF8–0xFF]

  这样,可以把字节空间分为六段:
  A:[0x00–0x7F]
  B:[0x80–0xBF]
  C:[0xC0–0xDF]
  D:[0xE0–0xEF]
  E:[0xF0–0xF7]
  F:[0xF8–0xFF]

  于是,上面的格式可以简写为:
  一字节:A
  二字节:CB
  三字节:DBB
  四字节:EBBB
  非法字节:F

  识别算法是一个有限自动机(FA)。记初始状态、成功(迄今为止是UTF-8格式)状态为SS;失败(已确定不是UTF-8格式)状态为SF。那么可以按下面的方法设立中间状态:

  一字节:SS A SS
  二字节:SS C S1 B SS
  三字节:SS D S2 B S3 B SS
  四字节:SS E S4 B S5 B S6 B SS
  非法字节:SS F SF

  容易知道这是一个确定有限自动机(DFA)。

  识别算法的伪码模板为:

  While NOT EOF
   BEGIN
    CASE SS:
     CASE A: Continue;
     CASE B: Enter SF; Break Loop;
     CASE C: Enter S1;
     CASE D: Enter S2;
     CASE E: Enter S4;
     CASE F: Enter SF; Break Loop;

    CASE S1:
     CASE A: Enter SF; Break Loop;
     CASE B: Enter SS;
     CASE C: Enter SF; Break Loop;
     CASE D: Enter SF; Break Loop;
     CASE E: Enter SF; Break Loop;
     CASE F: Enter SF; Break Loop;

    CASE S2:
     CASE A: Enter SF; Break Loop;
     CASE B: Enter S3;
     CASE C: Enter SF; Break Loop;
     CASE D: Enter SF; Break Loop;
     CASE E: Enter SF; Break Loop;
     CASE F: Enter SF; Break Loop;

    CASE S3:
     CASE A: Enter SF; Break Loop;
     CASE B: Enter SS;
     CASE C: Enter SF; Break Loop;
     CASE D: Enter SF; Break Loop;
     CASE E: Enter SF; Break Loop;
     CASE F: Enter SF; Break Loop;

    CASE S4:
     CASE A: Enter SF; Break Loop;
     CASE B: Enter S5;
     CASE C: Enter SF; Break Loop;
     CASE D: Enter SF; Break Loop;
     CASE E: Enter SF; Break Loop;
     CASE F: Enter SF; Break Loop;

    CASE S5:
     CASE A: Enter SF; Break Loop;
     CASE B: Enter S6;
     CASE C: Enter SF; Break Loop;
     CASE D: Enter SF; Break Loop;
     CASE E: Enter SF; Break Loop;
     CASE F: Enter SF; Break Loop;

    CASE S6:
     CASE A: Enter SF; Break Loop;
     CASE B: Enter SS;
     CASE C: Enter SF; Break Loop;
     CASE D: Enter SF; Break Loop;
     CASE E: Enter SF; Break Loop;
     CASE F: Enter SF; Break Loop;
   END;

  IF SS THEN YES ELSE NO;

  当然,可以大大优化一番。

  不用状态变量的实现方法:

SS:
if EOF then return TRUE
if within A then goto SS
if within B then return FALSE
if within C
then if EOF
then return FALSE
else if within B
then goto SS
else return FALSE
if within D
then if EOF
then return FALSE
else if within B
then if EOF
then return FALSE
else if within B
then goto SS
else return FALSE
else return FALSE
if within E
then if EOF
then return FALSE
else if within B
then if EOF
then return FALSE
else if within B
then if EOF
then return FALSE
else if within B
then goto SS
else return FALSE
else return FALSE
else return FALSE
return FALSE

  利用C/C++的&&算符的顺序计算和短路计算的特点,上面的伪码可以表示为:

bool is_UTF-8_Stream(FILE* f) {
unsigned char b;

SS:
if eof(f) then return TRUE;

b = nextbyte(f);

if(b within A) goto SS;

if(b within B) return FALSE;

if(b within C)
if ( (NOT eof(f)) && (nextbyte(f) within B)) goto SS;
else return FALSE;

if(b within D)
if ( (NOT eof(f)) && (nextbyte(f) within B)
&& (NOT eof(f)) && (nextbyte(f) within B)) goto SS;
else return FALSE;

if(b within E)
if ( (NOT eof(f)) && (nextbyte(f) within B)
&& (NOT eof(f)) && (nextbyte(f) within B)
&& (NOT eof(f)) && (nextbyte(f) within B)) goto SS;
else return FALSE;

return FALSE;
}

  最后的代码为(没有编译过):

#include <stdlib.h>
#include <stdio.h>

// Reads f from its current position to end-of-file and reports whether every
// byte sequence has the UTF-8 shape described by the byte classes below:
//
//   A  0x00-0x7F   single byte
//   B  0x80-0xBF   trailing byte
//   C  0xC0-0xDF   lead byte followed by one  B
//   D  0xE0-0xEF   lead byte followed by two  B
//   E  0xF0-0xF7   lead byte followed by three B
//   F  0xF8-0xFF   never valid
//
// Returns true for an empty stream. Returns false on a stray trailing byte, a
// class-F byte, a lead byte not followed by enough trailing bytes, or a read
// error.
//
// This is a structural check only: it does not reject non-shortest forms,
// encoded surrogates (0xED 0xA0-0xBF ...) or values above U+10FFFF.
bool is_UTF_8_Stream(FILE* f)
{
    const int A_UPPER = 0x7F;
    const int B_LOWER = 0x80;
    const int B_UPPER = 0xBF;
    const int C_UPPER = 0xDF;
    const int D_UPPER = 0xEF;
    const int E_UPPER = 0xF7;

    int b;   // int, not char, so EOF stays distinguishable from byte 0xFF

    while ((b = getc(f)) != EOF) {
        int trailing;   // number of class-B bytes that must follow this byte

        if (b <= A_UPPER)
            continue;            // plain ASCII
        else if (b <= B_UPPER)
            return false;        // trailing byte without a lead byte
        else if (b <= C_UPPER)
            trailing = 1;
        else if (b <= D_UPPER)
            trailing = 2;
        else if (b <= E_UPPER)
            trailing = 3;
        else
            return false;        // class F

        for (; trailing > 0; --trailing) {
            b = getc(f);
            if (b == EOF || b < B_LOWER || b > B_UPPER)
                return false;    // truncated sequence or wrong byte class
        }
    }

    // getc also returns EOF on a read error; only a clean end-of-file counts.
    return ferror(f) == 0;
}

  来看看(Addison) Unicode Demystified一书关于UTF-8的内容

UTF-8

UTF-8 is the 8-bit Unicode encoding form. It was designed to allow Unicode to be used in places that support only 8-bit character encodings. A Unicode code point is represented using a sequence of anywhere from one to four 8-bit code units.

One vitally important property of UTF-8 is that it's 100 percent backward compatible with ASCII. That is, valid 7-bit ASCII text is also valid UTF-8 text. As a consequence UTF-8 can be used in any environment that supports 8-bit ASCII-derived encodings, and that environment will be able to correctly interpret and display the 7-bit ASCII characters. (The characters represented by byte values where the most significant bit is set, of course, aren't backward compatible—they have a different representation in UTF-8 than they do in the legacy encodings.)

The only ambiguous thing here is the trailing byte. You can tell a byte is a trailing byte, but you can't tell which byte of the character it is, or how many bytes the character is. Because of UTF-8's design, however, you never have to scan forward or back more than three bytes to find out.

One side effect of UTF-8's design is that many code points can have more than one potential representation in UTF-8. For example, U+0041 could be represented not only as 0x41, but also as 0xC1 0x81 or 0xE0 0x81 0x81. The standard stipulates that the shortest possible representation for any code point is the only legal one. Prior to Unicode 3.1, it was legal for UTF-8 implementations to interpret these "non-shortest form" byte sequences. This possibility created a potential security hole, so Unicode 3.1 tightened the definition to disallow both interpretation and generation of nonshortest form sequences.

Another version of "non-shortest form" UTF-8 also needs to be recognized: representation of supplementary-plane characters using six-byte sequences. For example, instead of representing U+E0041 in UTF-8 as 0xF3 0xA0 0x81 0x81, as in the example above, you could conceivably represent it as 0xED 0xAD 0x80 0xED 0xB1 0x81. That is what you get if you convert it first to UTF-16, producing 0xDB40 0xDC41, and then convert the two surrogate code units to UTF-8.

Even Unicode 3.1's tightening of the UTF-8 definition still allowed this problem to occur. As with other non-shortest form sequences, it was illegal to produce such byte sequences, but legal to interpret them. Unicode 3.2 closes this loophole: UTF-8 sequences representing code point values in the surrogate range (U+D800 to U+DFFF) are now completely illegal. That is, any three-byte UTF-8 sequence whose first byte is 0xED and whose second byte is anything from 0xA0 to 0xBF is illegal.

As conversion of code points to UTF-8 results in a sequence of bytes, UTF-8 is effectively a character encoding scheme unto itself, and not just a character encoding form.

(按:UTF-8更加麻烦,程序员就是这么痛苦的挣扎着)



<< Home

This page is powered by Blogger. Isn't yours?