c语言怎么把UTF-8转换成Unicode

如题，要求把2字节或3字节字符的UTF-8编码转换成Unicode，求源代码。

 我来答

4个回答

#热议# 网上掀起『练心眼子』风潮，真的能提高情商吗？

娱乐小八卦啊a

高粉答主

2020-02-16 · 娱乐小八卦，天天都知道

娱乐小八卦啊a

采纳数：256 获赞数：117852

向TA提问私信TA

关注

展开全部

下面程序给出的是UTF-8转成Unicode（UCS-2）的函数：

#include <stdio.h>

#include <stdlib.h>

#include <memory.h>

#include <string.h>

/**
 * Convert a NUL-terminated UTF-8 string to UCS-2 (little-endian byte order).
 *
 * Only 1-, 2- and 3-byte UTF-8 sequences are handled; any other lead byte,
 * a missing/invalid continuation byte, or a truncated sequence is an error.
 * Like the original code, overlong encodings and surrogate code points are
 * not rejected.
 *
 * @param pInput   UTF-8 input string terminated by '\0'.
 * @param ppOutput Receives a newly allocated buffer holding the UCS-2 data
 *                 followed by a 2-byte zero terminator. The caller owns the
 *                 buffer and must free() it. On failure it is set to NULL,
 *                 so calling free() on it remains safe.
 * @return Number of bytes of UCS-2 data written (terminator excluded),
 *         or -1 on error.
 */
int utf8_to_unicode(char* pInput, char** ppOutput)
{
    if (ppOutput == NULL) {
        return -1;
    }
    *ppOutput = NULL;
    if (pInput == NULL) {
        return -1;
    }

    /* Work on unsigned bytes so masks and shifts never see negative values. */
    const unsigned char *in = (const unsigned char *)pInput;
    const size_t inputLen = strlen(pInput);

    /* The byte count is returned as int: refuse inputs whose worst-case
     * output (2 bytes per input byte) would not fit. (unsigned)-1 / 2 is
     * INT_MAX on the usual platforms, spelled without <limits.h>. */
    const size_t maxUnits = (size_t)(((unsigned int)-1) / 2u) / 2u;
    if (inputLen > maxUnits) {
        return -1;
    }

    /* Worst case is one 16-bit unit per input byte, plus one extra unit for
     * the terminator. calloc zero-fills, so the terminator is already there. */
    unsigned char *out = calloc(inputLen + 1, 2);
    if (out == NULL) {
        return -1;
    }

    size_t outputSize = 0; /* bytes of UCS-2 data written so far */

    while (*in != 0) {
        unsigned int codePoint;

        if (in[0] < 0x80) {
            /* 0xxxxxxx: single byte (ASCII) */
            codePoint = in[0];
            in += 1;
        } else if ((in[0] & 0xE0) == 0xC0) {
            /* 110xxxxx 10xxxxxx */
            if ((in[1] & 0xC0) != 0x80) {
                goto fail;
            }
            codePoint = ((unsigned int)(in[0] & 0x1F) << 6) | (in[1] & 0x3F);
            in += 2;
        } else if ((in[0] & 0xF0) == 0xE0) {
            /* 1110xxxx 10xxxxxx 10xxxxxx
             * The || short-circuits, so in[2] is never read when in[1] is
             * the string terminator. */
            if ((in[1] & 0xC0) != 0x80 || (in[2] & 0xC0) != 0x80) {
                goto fail;
            }
            codePoint = ((unsigned int)(in[0] & 0x0F) << 12)
                      | ((unsigned int)(in[1] & 0x3F) << 6)
                      | (in[2] & 0x3F);
            in += 3;
        } else {
            /* Stray continuation byte or a sequence longer than 3 bytes. */
            goto fail;
        }

        /* Little-endian: low byte at the lower address. */
        out[outputSize++] = (unsigned char)(codePoint & 0xFF);
        out[outputSize++] = (unsigned char)((codePoint >> 8) & 0xFF);
    }

    *ppOutput = (char *)out;
    return (int)outputSize;

fail:
    free(out);
    return -1;
}

扩展资料

UTF-8：互联网的普及，强烈要求出现一种统一的编码方式。 UTF-8就是在互联网上使用最广的一种unicode的实现方式。其他实现方式还包括UTF-16和UTF-32，不过在互联网上基本不用。重复一遍，这里的关系是，UTF-8是Unicode的实现方式之一。

UTF-8最大的一个特点，就是它是一种变长的编码方式。它可以使用1~4个字节表示一个符号，根据不同的符号而变化字节长度。（早期的规范曾允许最多6个字节，RFC 3629 已将其限制为最多4个字节。）

UTF-8的编码规则：

UTF-8的编码规则很简单，只有两条：

1、对于单字节的符号，字节的第一位设为0，后面7位为这个符号的unicode码。因此对于英语字母，UTF-8编码和ASCII码是相同的。

2、对于n字节的符号(n>1)，第一个字节的前n位都设为1，第n+1位设为0，后面字节的前两位一律设为10。剩下的没有提及的二进制位，全部为这个符号的unicode码。

已赞过 已踩过<

评论收起

匿名用户
推荐于2017-11-28

展开全部

Unicode有二字节编码（UCS-2）和四字节编码（UCS-4）两种，目前常用的是二字节编码。下面程序给出的是UTF-8转成Unicode（UCS-2）的函数。

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>

/*************************************************************************************************
 * Convert a UTF-8 string to Unicode (UCS-2).
 *
 * Parameters:
 *    char* pInput     pointer to the input string (terminated by '\0')
 *    char** ppOutput  pointer to the pointer that receives the output buffer
 *
 * Return value:
 *    the number of bytes of converted UCS-2 data (the 2-byte zero terminator
 *    is not counted), or -1 on error
 *
 * Notes:
 *    1. UTF-8 has no byte-order issue, but UCS-2 does. This function writes
 *       little-endian output (low byte at the lower address).
 *    2. Only 1-, 2- and 3-byte UTF-8 sequences are handled. Any other lead
 *       byte, an invalid continuation byte, or a truncated sequence is an
 *       error. Overlong encodings and surrogates are not rejected.
 *    3. On success the caller must free() the buffer stored in *ppOutput.
 *       On failure *ppOutput is set to NULL, so calling free() on it is
 *       still safe.
 **************************************************************************************************/
int utf8_to_unicode(char* pInput, char** ppOutput)
{
    if (ppOutput == NULL) {
        return -1;
    }
    *ppOutput = NULL;
    if (pInput == NULL) {
        return -1;
    }

    /* Work on unsigned bytes so masks and shifts never see negative values. */
    const unsigned char *src = (const unsigned char *)pInput;
    const size_t srcLen = strlen(pInput);

    /* The byte count is returned as int: refuse inputs whose worst-case
     * output (2 bytes per input byte) would not fit. (unsigned)-1 / 2 is
     * INT_MAX on the usual platforms, spelled without <limits.h>. */
    if (srcLen > (size_t)(((unsigned int)-1) / 2u) / 2u) {
        return -1;
    }

    /* One 16-bit unit per input byte is the worst case; the extra unit holds
     * the terminator. calloc zero-fills, so no separate memset is needed. */
    unsigned char *dst = calloc(srcLen + 1, 2);
    if (dst == NULL) {
        return -1;
    }

    size_t outputSize = 0; /* bytes of UCS-2 data written so far */

    while (*src != 0) {
        unsigned int unit;

        if (src[0] < 0x80) {
            /* single-byte UTF-8 character (ASCII) */
            unit = src[0];
            src += 1;
        } else if ((src[0] & 0xE0) == 0xC0) {
            /* two-byte UTF-8 character: 110xxxxx 10xxxxxx */
            if ((src[1] & 0xC0) != 0x80) {
                goto fail;
            }
            unit = ((unsigned int)(src[0] & 0x1F) << 6) | (src[1] & 0x3F);
            src += 2;
        } else if ((src[0] & 0xF0) == 0xE0) {
            /* three-byte UTF-8 character: 1110xxxx 10xxxxxx 10xxxxxx
             * The || short-circuits, so src[2] is never read when src[1]
             * is the string terminator. */
            if ((src[1] & 0xC0) != 0x80 || (src[2] & 0xC0) != 0x80) {
                goto fail;
            }
            unit = ((unsigned int)(src[0] & 0x0F) << 12)
                 | ((unsigned int)(src[1] & 0x3F) << 6)
                 | (src[2] & 0x3F);
            src += 3;
        } else {
            /* UTF-8 sequences of other lengths are not handled */
            goto fail;
        }

        /* little-endian: low byte first, high byte second */
        dst[outputSize++] = (unsigned char)(unit & 0xFF);
        dst[outputSize++] = (unsigned char)((unit >> 8) & 0xFF);
    }

    *ppOutput = (char *)dst;
    return (int)outputSize;

fail:
    free(dst);
    return -1;
}

// A usage example: int main(int argc, char** argv)
/* Declared here so the example compiles on its own. */
int utf8_to_unicode(char* pInput, char** ppOutput);

/**
 * Usage example: convert one Chinese character and print the resulting
 * bytes as hexadecimal.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    /* The character U+6211 is E6 88 91 in UTF-8; its code point is 0x6211. */
    char str[4] = {(char)0xE6, (char)0x88, (char)0x91, (char)0x00};
    char* uni = NULL;

    int num = utf8_to_unicode(str, &uni);
    if (num == -1)
    {
        printf("Error!\n");
    }
    else
    {
        const char* p = uni;
        for (int i = 0; i < num; i++)
        {
            /* Go through unsigned char: a plain char >= 0x80 may be negative
             * and would sign-extend to FFFFFFxx when printed with %X. */
            printf("%02X", (unsigned int)(unsigned char)*p);
            p++;
        }
        printf("\n");
    }

    free(uni); /* release the buffer; free(NULL) is a no-op */
    return 0;
}


本回答被网友采纳






已赞过已踩过<

你对这个回答的评价是？
评论收起

匿名用户
2013-04-03

展开全部

/**
 * Decode the first UTF-8 sequence of s into a Unicode code point.
 *
 * Only 1-, 2- and 3-byte sequences are supported.
 *
 * @param s     UTF-8 bytes, NUL-terminated.
 * @param bytes If not NULL, receives the number of bytes consumed (1, 2 or
 *              3), or 0 when s is NULL or does not start with a valid
 *              supported sequence.
 * @return The decoded code point, or 0 on invalid input. A 0 return with
 *         *bytes == 1 means the input started with the NUL byte itself.
 */
int utf8ToUnicode( char *s, int *bytes )
{
    const unsigned char *p = (const unsigned char *)s;
    int consumed = 0;
    int codePoint = 0;

    if (p != NULL) {
        if ((p[0] & 0x80) == 0) {
            /* one-byte UTF-8: 0xxxxxxx */
            consumed = 1;
            codePoint = p[0];
        } else if ((p[0] & 0xE0) == 0xC0) {
            /* two-byte UTF-8: 110xxxxx 10xxxxxx */
            if ((p[1] & 0xC0) == 0x80) {
                consumed = 2;
                codePoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
            }
        } else if ((p[0] & 0xF0) == 0xE0) {
            /* three-byte UTF-8: 1110xxxx 10xxxxxx 10xxxxxx
             * The && short-circuits, so p[2] is never read when p[1] is the
             * string terminator. */
            if ((p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80) {
                consumed = 3;
                codePoint = ((p[0] & 0x0F) << 12)
                          | ((p[1] & 0x3F) << 6)
                          | (p[2] & 0x3F);
            }
        }
    }

    if (bytes != NULL) {
        *bytes = consumed;
    }
    return codePoint;
}

已赞过 已踩过<

评论收起

百度网友d18d2bc
2013-04-02 · 超过16用户采纳过TA的回答

知道答主

回答量：63

采纳率：0%

帮助的人：39.8万

我也去答题访问个人页

关注

展开全部

UTF-8？？是ASCII吧

已赞过 已踩过<

评论收起

1条折叠回答

更多回答（2）

推荐律师服务：若未解决您的问题，请您详细描述您的问题，通过百度律临进行免费专业咨询

c语言怎么把UTF-8转换成Unicode

其他类似问题

为你推荐：