如何检查一个文本文件是GB2312编码还是无BOM的UTF-8编码?因为无BOM的UTF-8编码不知道怎么检测。 35
3个回答
展开全部
判断不带BOM的UTF-8文件,只要文件内容符合下面的编码格式即可。附上一个方法:
//00000000 - 0000007F 0xxxxxxx
//00000080 - 000007FF 110xxxxx 10xxxxxx
//00000800 - 0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
//00010000 - 001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//00200000 - 03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
//04000000 - 7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
方法:(还有个方法正在试)
/// <summary>
/// Heuristically decides whether the bytes readable from <paramref name="fileStream"/>
/// form BOM-less, multi-byte UTF-8 text.
/// </summary>
/// <param name="fileStream">
/// Stream to inspect. It is read from its current position; the stream is left open.
/// </param>
/// <returns>
/// <c>true</c> when every sequence matches the 1-6 byte UTF-8 layout
/// (lead byte followed by the right number of <c>10xxxxxx</c> bytes) and at least one
/// multi-byte sequence is present. <c>false</c> for malformed or truncated sequences,
/// and also for pure-ASCII or empty input, which cannot be told apart from other
/// ASCII-compatible encodings.
/// </returns>
private static bool IsUTF8(FileStream fileStream)
{
    // The reader is not disposed on purpose: disposing it would close the caller's stream.
    BinaryReader binaryReader = new BinaryReader(fileStream);
    int fileStreamLength = (int)fileStream.Length;
    byte[] bytes = binaryReader.ReadBytes(fileStreamLength);

    // ReadBytes returns fewer bytes than requested when the stream is not positioned
    // at its start, so bound every index by what was actually read.
    int byteCount = bytes.Length;
    int index = 0;
    int ascN = 0;

    while (index < byteCount)
    {
        byte leadByte = bytes[index];

        // Number of leading 1 bits of the lead byte == total length of the sequence.
        int leadingOnes = 0;
        for (int mask = 0x80; mask != 0 && (leadByte & mask) != 0; mask >>= 1)
        {
            leadingOnes++;
        }

        if (leadingOnes == 0)
        {
            // 0xxxxxxx: plain ASCII.
            index += 1;
            ascN += 1;
        }
        else if (leadingOnes > 1 && leadingOnes < 7)
        {
            // A sequence cut off by the end of the data is not valid UTF-8.
            if (index + leadingOnes > byteCount)
            {
                return false;
            }

            for (int i = 1; i < leadingOnes; i++)
            {
                // Every trailing byte must look like 10xxxxxx.
                if ((bytes[index + i] & 0xC0) != 0x80)
                {
                    return false;
                }
            }

            index += leadingOnes;
        }
        else
        {
            // 10xxxxxx in lead position, or 0xFE / 0xFF: never valid as a lead byte.
            return false;
        }
    }

    // Only ASCII (or nothing) was seen: no evidence that the data is UTF-8.
    return ascN != byteCount;
}
/// <summary>
/// Formats a byte as an 8-character string of '0' and '1', most significant bit first.
/// </summary>
/// <param name="byteValue">The byte to format.</param>
/// <returns>Exactly eight binary digits, e.g. <c>"11000011"</c> for 0xC3.</returns>
private static string getBinaryStrFromByte(byte byteValue)
{
    char[] digits = new char[8];

    // Fill from the right: the lowest bit becomes the last character.
    for (int position = digits.Length - 1; position >= 0; position--)
    {
        digits[position] = (byteValue & 1) == 0 ? '0' : '1';
        byteValue >>= 1;
    }

    return new string(digits);
}
方法二:(较好)
/// <summary>
/// Checks whether the bytes readable from <paramref name="fileStream"/> are laid out
/// as well-formed UTF-8 sequences (1-6 byte form), without requiring a BOM.
/// </summary>
/// <param name="fileStream">
/// Stream to inspect. It is read from its current position; the stream is left open.
/// </param>
/// <returns>
/// <c>true</c> when every lead byte is followed by the expected number of
/// <c>10xxxxxx</c> bytes and the data does not end in the middle of a sequence.
/// Pure-ASCII and empty input also return <c>true</c>. <c>false</c> otherwise.
/// </returns>
private static bool IsUTF8Byte(FileStream fileStream)
{
    // The reader is not disposed on purpose: disposing it would close the caller's stream.
    BinaryReader binaryReader = new BinaryReader(fileStream);
    int fileStreamLength = (int)fileStream.Length;
    byte[] bytes = binaryReader.ReadBytes(fileStreamLength);

    // Bytes still expected for the current character, counting the lead byte itself;
    // 1 means "at a character boundary".
    int charByteCounter = 1;
    byte curByte;

    // Iterate over what was actually read: ReadBytes returns fewer bytes than
    // requested when the stream is not positioned at its start.
    for (int i = 0; i < bytes.Length; i++)
    {
        curByte = bytes[i];
        if (charByteCounter == 1)
        {
            if (curByte >= 0x80)
            {
                // Count the leading 1 bits of the lead byte to get the sequence length.
                while (((curByte <<= 1) & 0x80) != 0)
                {
                    charByteCounter++;
                }

                // 10xxxxxx cannot start a character; more than 6 bytes is not UTF-8.
                if (charByteCounter == 1 || charByteCounter > 6)
                {
                    return false;
                }
            }
        }
        else
        {
            // Inside a multi-byte character: the byte must be 10xxxxxx.
            if ((curByte & 0xC0) != 0x80)
            {
                return false;
            }

            charByteCounter--;
        }
    }

    // A counter other than 1 means the data ended mid-character.
    return charByteCounter == 1;
}
//00000000 - 0000007F 0xxxxxxx
//00000080 - 000007FF 110xxxxx 10xxxxxx
//00000800 - 0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
//00010000 - 001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//00200000 - 03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
//04000000 - 7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
方法:(还有个方法正在试)
/// <summary>
/// Heuristically decides whether the bytes readable from <paramref name="fileStream"/>
/// form BOM-less, multi-byte UTF-8 text.
/// </summary>
/// <param name="fileStream">
/// Stream to inspect. It is read from its current position; the stream is left open.
/// </param>
/// <returns>
/// <c>true</c> when every sequence matches the 1-6 byte UTF-8 layout
/// (lead byte followed by the right number of <c>10xxxxxx</c> bytes) and at least one
/// multi-byte sequence is present. <c>false</c> for malformed or truncated sequences,
/// and also for pure-ASCII or empty input, which cannot be told apart from other
/// ASCII-compatible encodings.
/// </returns>
private static bool IsUTF8(FileStream fileStream)
{
    // The reader is not disposed on purpose: disposing it would close the caller's stream.
    BinaryReader binaryReader = new BinaryReader(fileStream);
    int fileStreamLength = (int)fileStream.Length;
    byte[] bytes = binaryReader.ReadBytes(fileStreamLength);

    // ReadBytes returns fewer bytes than requested when the stream is not positioned
    // at its start, so bound every index by what was actually read.
    int byteCount = bytes.Length;
    int index = 0;
    int ascN = 0;

    while (index < byteCount)
    {
        byte leadByte = bytes[index];

        // Number of leading 1 bits of the lead byte == total length of the sequence.
        int leadingOnes = 0;
        for (int mask = 0x80; mask != 0 && (leadByte & mask) != 0; mask >>= 1)
        {
            leadingOnes++;
        }

        if (leadingOnes == 0)
        {
            // 0xxxxxxx: plain ASCII.
            index += 1;
            ascN += 1;
        }
        else if (leadingOnes > 1 && leadingOnes < 7)
        {
            // A sequence cut off by the end of the data is not valid UTF-8.
            if (index + leadingOnes > byteCount)
            {
                return false;
            }

            for (int i = 1; i < leadingOnes; i++)
            {
                // Every trailing byte must look like 10xxxxxx.
                if ((bytes[index + i] & 0xC0) != 0x80)
                {
                    return false;
                }
            }

            index += leadingOnes;
        }
        else
        {
            // 10xxxxxx in lead position, or 0xFE / 0xFF: never valid as a lead byte.
            return false;
        }
    }

    // Only ASCII (or nothing) was seen: no evidence that the data is UTF-8.
    return ascN != byteCount;
}
/// <summary>
/// Formats a byte as an 8-character string of '0' and '1', most significant bit first.
/// </summary>
/// <param name="byteValue">The byte to format.</param>
/// <returns>Exactly eight binary digits, e.g. <c>"11000011"</c> for 0xC3.</returns>
private static string getBinaryStrFromByte(byte byteValue)
{
    char[] digits = new char[8];

    // Fill from the right: the lowest bit becomes the last character.
    for (int position = digits.Length - 1; position >= 0; position--)
    {
        digits[position] = (byteValue & 1) == 0 ? '0' : '1';
        byteValue >>= 1;
    }

    return new string(digits);
}
方法二:(较好)
/// <summary>
/// Checks whether the bytes readable from <paramref name="fileStream"/> are laid out
/// as well-formed UTF-8 sequences (1-6 byte form), without requiring a BOM.
/// </summary>
/// <param name="fileStream">
/// Stream to inspect. It is read from its current position; the stream is left open.
/// </param>
/// <returns>
/// <c>true</c> when every lead byte is followed by the expected number of
/// <c>10xxxxxx</c> bytes and the data does not end in the middle of a sequence.
/// Pure-ASCII and empty input also return <c>true</c>. <c>false</c> otherwise.
/// </returns>
private static bool IsUTF8Byte(FileStream fileStream)
{
    // The reader is not disposed on purpose: disposing it would close the caller's stream.
    BinaryReader binaryReader = new BinaryReader(fileStream);
    int fileStreamLength = (int)fileStream.Length;
    byte[] bytes = binaryReader.ReadBytes(fileStreamLength);

    // Bytes still expected for the current character, counting the lead byte itself;
    // 1 means "at a character boundary".
    int charByteCounter = 1;
    byte curByte;

    // Iterate over what was actually read: ReadBytes returns fewer bytes than
    // requested when the stream is not positioned at its start.
    for (int i = 0; i < bytes.Length; i++)
    {
        curByte = bytes[i];
        if (charByteCounter == 1)
        {
            if (curByte >= 0x80)
            {
                // Count the leading 1 bits of the lead byte to get the sequence length.
                while (((curByte <<= 1) & 0x80) != 0)
                {
                    charByteCounter++;
                }

                // 10xxxxxx cannot start a character; more than 6 bytes is not UTF-8.
                if (charByteCounter == 1 || charByteCounter > 6)
                {
                    return false;
                }
            }
        }
        else
        {
            // Inside a multi-byte character: the byte must be 10xxxxxx.
            if ((curByte & 0xC0) != 0x80)
            {
                return false;
            }

            charByteCounter--;
        }
    }

    // A counter other than 1 means the data ended mid-character.
    return charByteCounter == 1;
}
推荐律师服务:
若未解决您的问题,请您详细描述您的问题,通过百度律临进行免费专业咨询