/*★ 文字コードを判別してデーターを入力 前田 稔 ★*/
using System;
using System.IO; // for File, StreamReader
using System.Text; // for Encoding
/// <summary>
/// Sample program that reads a fixed list of text files, determines each
/// file's character encoding (byte order mark first, byte-pattern heuristic
/// second) and writes the decoded text to the console.
/// </summary>
class TextFileRead
{
    // Code page identifiers handed to Encoding.GetEncoding.
    // NOTE(review): on .NET Core / .NET 5+ these legacy code pages are only
    // available after registering CodePagesEncodingProvider - confirm the
    // target runtime before relying on them.
    private const int CodePageJis = 50220;      // JIS (ISO-2022-JP escape sequences)
    private const int CodePageEucJp = 51932;    // EUC
    private const int CodePageShiftJis = 932;   // SJIS

    // Bytes that make up the JIS escape sequences recognised by GetCode.
    private const byte Escape = 0x1B;
    private const byte At = 0x40;       // '@'
    private const byte Dollar = 0x24;   // '$'
    private const byte And = 0x26;      // '&'
    private const byte Open = 0x28;     // '('
    private const byte LetterB = 0x42;
    private const byte LetterD = 0x44;
    private const byte LetterJ = 0x4A;
    private const byte LetterI = 0x49;

    /// <summary>
    /// Reads every file in the hard-coded list, decodes it with the detected
    /// encoding and prints the text. A file that cannot be read, or whose
    /// encoding cannot be determined, is reported and skipped.
    /// </summary>
    /// <returns>Always 0.</returns>
    public static int Main()
    {
        string[] file_name = {
            "C:\\data\\Test\\utf8.txt", "C:\\data\\Test\\utf8_bom.txt", "C:\\data\\Test\\utf16.txt",
            "C:\\data\\Test\\utf16LE.txt", "C:\\data\\Test\\utf16BE.txt", "C:\\data\\Test\\shift_jis.txt" };

        foreach (string path in file_name)
        {
            Console.WriteLine("\n☆File Name : " + path);

            byte[] bs;
            if (!TryReadAllBytes(path, out bs))
            {
                // One unreadable file must not stop the remaining files.
                continue;
            }

            string str;
            System.Text.Encoding enc = DetectEncodingFromBOM(bs);
            if (enc != null)
            {
                // Skip the BOM itself so it does not end up in the decoded text.
                int bomLen = enc.GetPreamble().Length;
                str = enc.GetString(bs, bomLen, bs.Length - bomLen);
            }
            else
            {
                enc = GetCode(bs);
                if (enc == null)
                {
                    Console.WriteLine("BOMが見つかりませんでした。");
                    continue;
                }
                // Decode the whole buffer with the guessed encoding.
                str = enc.GetString(bs);
            }
            Console.Write(str);
        }
        return 0;
    }

    /// <summary>
    /// Reads a whole file into memory, reporting I/O and permission failures
    /// on the error stream instead of letting them terminate the program.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <param name="data">The file content on success; null on failure.</param>
    /// <returns>true when the file was read; otherwise false.</returns>
    private static bool TryReadAllBytes(string path, out byte[] data)
    {
        try
        {
            data = System.IO.File.ReadAllBytes(path);
            return true;
        }
        catch (System.IO.IOException ex)
        {
            Console.Error.WriteLine("ファイルを読み込めませんでした。 " + ex.Message);
        }
        catch (UnauthorizedAccessException ex)
        {
            Console.Error.WriteLine("ファイルを読み込めませんでした。 " + ex.Message);
        }
        data = null;
        return false;
    }

    /// <summary>
    /// Identifies the encoding from a byte order mark at the start of
    /// <paramref name="bytes"/>.
    /// </summary>
    /// <param name="bytes">Raw file content.</param>
    /// <returns>
    /// A UTF-16 BE/LE, UTF-32 LE/BE or UTF-8 encoding object whose preamble
    /// matches the detected BOM, or null when no BOM is present.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    public static System.Text.Encoding DetectEncodingFromBOM(byte[] bytes)
    {
        if (bytes == null)
        {
            throw new ArgumentNullException("bytes");
        }

        if (HasPrefix(bytes, 0xfe, 0xff))
        {
            // UTF-16 BE
            return new System.Text.UnicodeEncoding(true, true);
        }
        // The UTF-32 LE mark starts with the UTF-16 LE mark, so the longer
        // pattern has to be tested first.
        if (HasPrefix(bytes, 0xff, 0xfe, 0x00, 0x00))
        {
            // UTF-32 LE
            return new System.Text.UTF32Encoding(false, true);
        }
        if (HasPrefix(bytes, 0xff, 0xfe))
        {
            // UTF-16 LE
            return new System.Text.UnicodeEncoding(false, true);
        }
        if (HasPrefix(bytes, 0xef, 0xbb, 0xbf))
        {
            // UTF-8
            return new System.Text.UTF8Encoding(true, true);
        }
        if (HasPrefix(bytes, 0x00, 0x00, 0xfe, 0xff))
        {
            // UTF-32 BE
            return new System.Text.UTF32Encoding(true, true);
        }
        return null;
    }

    /// <summary>
    /// Tells whether <paramref name="bytes"/> begins with the given byte
    /// sequence; a buffer shorter than the sequence never matches.
    /// </summary>
    private static bool HasPrefix(byte[] bytes, params byte[] prefix)
    {
        if (bytes.Length < prefix.Length)
        {
            return false;
        }
        for (int i = 0; i < prefix.Length; i++)
        {
            if (bytes[i] != prefix[i])
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Guesses the encoding of BOM-less data by inspecting byte patterns.
    /// </summary>
    /// <remarks>
    /// Checks run in this order:
    /// 1. bytes 0x00-0x06, 0x7F or 0xFF mark the data as binary; a 0x00
    ///    followed by a byte of 0x7F or less is taken as raw UTF-16 and
    ///    <see cref="System.Text.Encoding.Unicode"/> is returned regardless
    ///    of the actual byte order;
    /// 2. data without ESC and without bytes of 0x80 or above is ASCII;
    /// 3. a known JIS escape sequence selects code page 50220;
    /// 4. otherwise Shift_JIS, EUC and UTF-8 byte patterns are counted and
    ///    the strictly highest count wins.
    /// </remarks>
    /// <param name="bytes">Raw file content.</param>
    /// <returns>
    /// The guessed encoding, or null when the data looks binary or the
    /// counts do not produce a single winner.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    public static System.Text.Encoding GetCode(byte[] bytes)
    {
        if (bytes == null)
        {
            throw new ArgumentNullException("bytes");
        }

        int len = bytes.Length;

        // Step 1: binary data / raw UTF-16.
        bool isBinary = false;
        for (int i = 0; i < len; i++)
        {
            byte current = bytes[i];
            if (current <= 0x06 || current == 0x7F || current == 0xFF)
            {
                isBinary = true;
                if (current == 0x00 && i < len - 1 && bytes[i + 1] <= 0x7F)
                {
                    // Smells like raw Unicode.
                    return System.Text.Encoding.Unicode;
                }
            }
        }
        if (isBinary)
        {
            return null;
        }

        // Step 2: nothing that could be Japanese.
        if (IsPlainAscii(bytes))
        {
            return System.Text.Encoding.ASCII;
        }

        // Step 3: JIS escape sequences. The shortest sequence is three bytes
        // long, hence the loop bound.
        for (int i = 0; i < len - 2; i++)
        {
            if (bytes[i] == Escape && IsJisEscapeAt(bytes, i))
            {
                return System.Text.Encoding.GetEncoding(CodePageJis);
            }
        }

        // Step 4: should be EUC, Shift_JIS or UTF-8 - score each candidate.
        int sjis = CountShiftJisBytes(bytes);
        int euc = CountEucBytes(bytes);
        int utf8 = CountUtf8Bytes(bytes);

        // M. Takahashi's suggested weighting (utf8 += utf8 / 2) is left
        // disabled, so the raw counts are compared.
        System.Diagnostics.Debug.WriteLine(
            string.Format("sjis = {0}, euc = {1}, utf8 = {2}", sjis, euc, utf8));

        if (euc > sjis && euc > utf8)
        {
            return System.Text.Encoding.GetEncoding(CodePageEucJp);
        }
        if (sjis > euc && sjis > utf8)
        {
            return System.Text.Encoding.GetEncoding(CodePageShiftJis);
        }
        if (utf8 > euc && utf8 > sjis)
        {
            return System.Text.Encoding.UTF8;
        }
        // No strict winner (for example a tie).
        return null;
    }

    /// <summary>
    /// Returns true when the data contains neither ESC nor any byte of 0x80
    /// or above.
    /// </summary>
    private static bool IsPlainAscii(byte[] bytes)
    {
        foreach (byte b in bytes)
        {
            if (b == Escape || 0x80 <= b)
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Tests whether the ESC byte at <paramref name="index"/> starts one of
    /// the recognised JIS escape sequences. The caller guarantees that at
    /// least two more bytes follow <paramref name="index"/>.
    /// </summary>
    private static bool IsJisEscapeAt(byte[] bytes, int index)
    {
        int len = bytes.Length;
        byte b2 = bytes[index + 1];
        byte b3 = bytes[index + 2];

        // ESC $ @ (JIS_0208 1978) and ESC $ B (JIS_0208 1983)
        if (b2 == Dollar && (b3 == At || b3 == LetterB))
        {
            return true;
        }
        // ESC ( B / ESC ( J (JIS_ASC) and ESC ( I (JIS_KANA)
        if (b2 == Open && (b3 == LetterB || b3 == LetterJ || b3 == LetterI))
        {
            return true;
        }
        if (index < len - 3)
        {
            byte b4 = bytes[index + 3];
            // ESC $ ( D (JIS_0212)
            if (b2 == Dollar && b3 == Open && b4 == LetterD)
            {
                return true;
            }
            // ESC & @ ESC $ B (JIS_0208 1990)
            if (index < len - 5 &&
                b2 == And && b3 == At && b4 == Escape &&
                bytes[index + 4] == Dollar && bytes[index + 5] == LetterB)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Counts the bytes that belong to two-byte sequences matching the
    /// Shift_JIS lead/trail byte ranges.
    /// </summary>
    private static int CountShiftJisBytes(byte[] bytes)
    {
        int score = 0;
        for (int i = 0; i < bytes.Length - 1; i++)
        {
            byte lead = bytes[i];
            byte trail = bytes[i + 1];
            bool leadOk = (0x81 <= lead && lead <= 0x9F) || (0xE0 <= lead && lead <= 0xFC);
            bool trailOk = (0x40 <= trail && trail <= 0x7E) || (0x80 <= trail && trail <= 0xFC);
            if (leadOk && trailOk)
            {
                // SJIS_C: consume both bytes of the pair.
                score += 2;
                i++;
            }
        }
        return score;
    }

    /// <summary>
    /// Counts the bytes that belong to sequences matching the EUC patterns
    /// (two-byte EUC_C / EUC_KANA and three-byte EUC_0212).
    /// </summary>
    private static int CountEucBytes(byte[] bytes)
    {
        int len = bytes.Length;
        int score = 0;
        for (int i = 0; i < len - 1; i++)
        {
            byte b1 = bytes[i];
            byte b2 = bytes[i + 1];
            if (((0xA1 <= b1 && b1 <= 0xFE) && (0xA1 <= b2 && b2 <= 0xFE)) ||
                (b1 == 0x8E && (0xA1 <= b2 && b2 <= 0xDF)))
            {
                // EUC_C / EUC_KANA
                score += 2;
                i++;
            }
            else if (i < len - 2)
            {
                byte b3 = bytes[i + 2];
                if (b1 == 0x8F && (0xA1 <= b2 && b2 <= 0xFE) &&
                    (0xA1 <= b3 && b3 <= 0xFE))
                {
                    // EUC_0212
                    score += 3;
                    i += 2;
                }
            }
        }
        return score;
    }

    /// <summary>
    /// Counts the bytes that belong to two- and three-byte sequences
    /// matching the UTF-8 lead/continuation byte ranges.
    /// </summary>
    private static int CountUtf8Bytes(byte[] bytes)
    {
        int len = bytes.Length;
        int score = 0;
        for (int i = 0; i < len - 1; i++)
        {
            byte b1 = bytes[i];
            byte b2 = bytes[i + 1];
            if ((0xC0 <= b1 && b1 <= 0xDF) && (0x80 <= b2 && b2 <= 0xBF))
            {
                // Two-byte sequence.
                score += 2;
                i++;
            }
            else if (i < len - 2)
            {
                byte b3 = bytes[i + 2];
                if ((0xE0 <= b1 && b1 <= 0xEF) && (0x80 <= b2 && b2 <= 0xBF) &&
                    (0x80 <= b3 && b3 <= 0xBF))
                {
                    // Three-byte sequence.
                    score += 3;
                    i += 2;
                }
            }
        }
        return score;
    }
}
/*
Test files read by Main:

| File name     | Encoding  | BOM         |
|---------------|-----------|-------------|
| utf8.txt      | utf-8     | without BOM |
| utf8_BOM.txt  | utf-8     | with BOM    |
| utf16.txt     | utf-16    | with BOM    |
| utf16LE.txt   | utf-16LE  | with BOM    |
| utf16BE.txt   | utf-16BE  | with BOM    |
| shift_jis.txt | Shift_JIS | without BOM |

Key steps of the per-file processing (illustrative excerpt):

    bs = System.IO.File.ReadAllBytes(file_name[i]);
    enc = DetectEncodingFromBOM(bs);
    if (enc != null)
    {
        int bomLen = enc.GetPreamble().Length;
        str = enc.GetString(bs, bomLen, bs.Length - bomLen);
    }
    else
    {
        enc = GetCode(bs);
        if (enc == null)
        {   Console.WriteLine("BOMが見つかりませんでした。");
            continue;
        }
        // decode and display
        str = enc.GetString(bs);
    }
*/