/* Detect the character encoding of a file and read its text.  Author: 前田 稔 */
using System;
using System.IO;    // for File, StreamReader
using System.Text;  // for Encoding

/// <summary>
/// Console sample that reads several text files as raw bytes, determines their
/// encoding (BOM first, byte-pattern heuristic second) and prints the decoded text.
/// </summary>
class TextFileRead
{
    /// <summary>
    /// Reads each sample file, detects its encoding and writes the decoded text
    /// to the console.
    /// </summary>
    /// <returns>Always 0.</returns>
    public static int Main()
    {
        string[] file_name =
        {
            "C:\\data\\Test\\utf8.txt",
            "C:\\data\\Test\\utf8_bom.txt",
            "C:\\data\\Test\\utf16.txt",
            "C:\\data\\Test\\utf16LE.txt",
            "C:\\data\\Test\\utf16BE.txt",
            "C:\\data\\Test\\shift_jis.txt"
        };

        for (int i = 0; i < file_name.Length; i++)
        {
            Console.WriteLine("\n☆File Name : " + file_name[i]);
            byte[] bs = File.ReadAllBytes(file_name[i]);

            string str;
            Encoding enc = DetectEncodingFromBOM(bs);
            if (enc != null)
            {
                // The BOM itself is not part of the text, so skip it when decoding.
                int bomLen = enc.GetPreamble().Length;
                str = enc.GetString(bs, bomLen, bs.Length - bomLen);
            }
            else
            {
                // No BOM: fall back to the byte-pattern heuristic.
                enc = GetCode(bs);
                if (enc == null)
                {
                    // Reaching here means neither the BOM check nor the heuristic
                    // could identify the encoding, so report exactly that.
                    Console.WriteLine("文字コードを判別できませんでした。");
                    continue;
                }

                // Decode the whole buffer and display it.
                str = enc.GetString(bs);
            }

            Console.Write(str);
        }

        return 0;
    }

    /// <summary>
    /// Inspects the leading bytes for a byte order mark and returns the matching
    /// encoding.
    /// </summary>
    /// <param name="bytes">Raw file contents.</param>
    /// <returns>
    /// A UTF-16 (BE/LE), UTF-32 (BE/LE) or UTF-8 encoding whose preamble equals the
    /// BOM that was found, or null when no BOM is present.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    public static System.Text.Encoding DetectEncodingFromBOM(byte[] bytes)
    {
        if (bytes == null)
        {
            throw new ArgumentNullException("bytes");
        }

        int len = bytes.Length;
        if (len < 2)
        {
            return null;
        }

        if (bytes[0] == 0xfe && bytes[1] == 0xff)
        {
            // UTF-16 BE
            return new UnicodeEncoding(true, true);
        }

        if (bytes[0] == 0xff && bytes[1] == 0xfe)
        {
            // FF FE 00 00 is the UTF-32 LE BOM; it must be tested before the
            // UTF-16 LE BOM (FF FE), which is a prefix of it.
            if (len >= 4 && bytes[2] == 0x00 && bytes[3] == 0x00)
            {
                // UTF-32 LE
                return new UTF32Encoding(false, true);
            }

            // UTF-16 LE
            return new UnicodeEncoding(false, true);
        }

        if (len >= 3 && bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf)
        {
            // UTF-8
            return new UTF8Encoding(true, true);
        }

        if (len >= 4 && bytes[0] == 0x00 && bytes[1] == 0x00 && bytes[2] == 0xfe && bytes[3] == 0xff)
        {
            // UTF-32 BE
            return new UTF32Encoding(true, true);
        }

        return null;
    }

    /// <summary>
    /// Guesses the encoding of BOM-less bytes from their byte patterns.
    /// </summary>
    /// <param name="bytes">Raw file contents.</param>
    /// <returns>
    /// UTF-16 LE, ASCII, ISO-2022-JP (code page 50220), EUC-JP (51932),
    /// Shift_JIS (932) or UTF-8; null when the data looks binary or no candidate
    /// can be chosen.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    /// <remarks>
    /// NOTE(review): on .NET Core / .NET 5+ the code pages 50220, 51932 and 932 are
    /// only available after registering CodePagesEncodingProvider - confirm the
    /// target framework.
    /// </remarks>
    public static System.Text.Encoding GetCode(byte[] bytes)
    {
        if (bytes == null)
        {
            throw new ArgumentNullException("bytes");
        }

        const byte bEscape = 0x1B;
        const byte bAt = 0x40;
        const byte bDollar = 0x24;
        const byte bAnd = 0x26;
        const byte bOpen = 0x28;    // '('
        const byte bB = 0x42;
        const byte bD = 0x44;
        const byte bJ = 0x4A;
        const byte bI = 0x49;

        int len = bytes.Length;
        byte b1, b2, b3, b4;

        // Encode::is_utf8 is ignored.

        // Step 1: control bytes mean binary data, unless a NUL is followed by an
        // ASCII byte, which is taken as BOM-less UTF-16.
        // NOTE(review): this reports UTF-16 LE even when the NUL comes first in the
        // pair (big-endian layout) - confirm whether BOM-less UTF-16 BE must be supported.
        bool isBinary = false;
        for (int i = 0; i < len; i++)
        {
            b1 = bytes[i];
            if (b1 <= 0x06 || b1 == 0x7F || b1 == 0xFF)
            {
                // 'binary'
                isBinary = true;
                if (b1 == 0x00 && i < len - 1 && bytes[i + 1] <= 0x7F)
                {
                    // smells like raw unicode
                    return Encoding.Unicode;
                }
            }
        }
        if (isBinary)
        {
            return null;
        }

        // Step 2: no ESC and no byte >= 0x80 means plain ASCII (not Japanese).
        bool notJapanese = true;
        for (int i = 0; i < len; i++)
        {
            b1 = bytes[i];
            if (b1 == bEscape || 0x80 <= b1)
            {
                notJapanese = false;
                break;
            }
        }
        if (notJapanese)
        {
            return Encoding.ASCII;
        }

        // Step 3: any ISO-2022-JP escape sequence identifies JIS.
        for (int i = 0; i < len - 2; i++)
        {
            if (bytes[i] != bEscape)
            {
                continue;
            }

            b2 = bytes[i + 1];
            b3 = bytes[i + 2];

            bool isJis =
                (b2 == bDollar && (b3 == bAt || b3 == bB))              // JIS_0208 1978 / 1983
                || (b2 == bOpen && (b3 == bB || b3 == bJ || b3 == bI)); // JIS_ASC / JIS_KANA

            if (!isJis && i < len - 3)
            {
                b4 = bytes[i + 3];
                isJis =
                    (b2 == bDollar && b3 == bOpen && b4 == bD)          // JIS_0212
                    || (i < len - 5 && b2 == bAnd && b3 == bAt && b4 == bEscape
                        && bytes[i + 4] == bDollar && bytes[i + 5] == bB); // JIS_0208 1990
            }

            if (isJis)
            {
                return Encoding.GetEncoding(50220);
            }
        }

        // Step 4: should be euc|sjis|utf8 - count the bytes that form valid
        // multi-byte sequences under each candidate.
        // use of (?:) by Hiroki Ohzaki <ohzaki@iod.ricoh.co.jp>
        int sjis = 0;
        int euc = 0;
        int utf8 = 0;

        for (int i = 0; i < len - 1; i++)
        {
            b1 = bytes[i];
            b2 = bytes[i + 1];
            if (((0x81 <= b1 && b1 <= 0x9F) || (0xE0 <= b1 && b1 <= 0xFC))
                && ((0x40 <= b2 && b2 <= 0x7E) || (0x80 <= b2 && b2 <= 0xFC)))
            {
                // SJIS_C
                sjis += 2;
                i++;
            }
        }

        for (int i = 0; i < len - 1; i++)
        {
            b1 = bytes[i];
            b2 = bytes[i + 1];
            if (((0xA1 <= b1 && b1 <= 0xFE) && (0xA1 <= b2 && b2 <= 0xFE))
                || (b1 == 0x8E && (0xA1 <= b2 && b2 <= 0xDF)))
            {
                // EUC_C, EUC_KANA
                euc += 2;
                i++;
            }
            else if (i < len - 2)
            {
                b3 = bytes[i + 2];
                if (b1 == 0x8F && (0xA1 <= b2 && b2 <= 0xFE) && (0xA1 <= b3 && b3 <= 0xFE))
                {
                    // EUC_0212
                    euc += 3;
                    i += 2;
                }
            }
        }

        for (int i = 0; i < len - 1; i++)
        {
            b1 = bytes[i];
            b2 = bytes[i + 1];
            if ((0xC0 <= b1 && b1 <= 0xDF) && (0x80 <= b2 && b2 <= 0xBF))
            {
                // UTF8 (2-byte sequence)
                utf8 += 2;
                i++;
            }
            else if (i < len - 2)
            {
                b3 = bytes[i + 2];
                if ((0xE0 <= b1 && b1 <= 0xEF) && (0x80 <= b2 && b2 <= 0xBF) && (0x80 <= b3 && b3 <= 0xBF))
                {
                    // UTF8 (3-byte sequence)
                    utf8 += 3;
                    i += 2;
                }
            }
        }

        // M. Takahashi's suggestion
        // utf8 += utf8 / 2;

        System.Diagnostics.Debug.WriteLine(
            string.Format("sjis = {0}, euc = {1}, utf8 = {2}", sjis, euc, utf8));

        if (euc > sjis && euc > utf8)
        {
            // EUC
            return Encoding.GetEncoding(51932);
        }
        if (sjis > euc && sjis > utf8)
        {
            // SJIS
            return Encoding.GetEncoding(932);
        }
        if (utf8 > euc && utf8 > sjis)
        {
            // UTF8
            return Encoding.UTF8;
        }

        // Tie-breaker: runs of 3-byte UTF-8 characters (e.g. E3 81 82 E3 81 82) are
        // also counted byte-for-byte by the Shift_JIS pattern, so the scores tie and
        // no candidate wins.  When UTF-8 shares the top score and the whole buffer is
        // strictly valid UTF-8, accept UTF-8 instead of giving up.
        if (utf8 > 0 && utf8 >= euc && utf8 >= sjis && IsStrictUtf8(bytes))
        {
            return Encoding.UTF8;
        }

        return null;
    }

    // Returns true when the buffer decodes as UTF-8 without any invalid sequence.
    private static bool IsStrictUtf8(byte[] bytes)
    {
        try
        {
            // throwOnInvalidBytes = true makes the decoder throw instead of
            // substituting U+FFFD, so a clean count means the bytes are valid.
            new UTF8Encoding(false, true).GetCharCount(bytes);
            return true;
        }
        catch (ArgumentException)
        {
            // DecoderFallbackException derives from ArgumentException.
            return false;
        }
    }
}
ファイル名 | 文字コード | BOM |
---|---|---|
utf8.txt | utf-8 | BOM 無し |
utf8_bom.txt | utf-8 | BOM 有り |
utf16.txt | utf-16 | BOM 有り |
utf16LE.txt | utf-16LE | BOM 有り |
utf16BE.txt | utf-16BE | BOM 有り |
shift_jis.txt | Shift_JIS | BOM 無し |
// Excerpt of the per-file loop body: read the raw bytes, pick an encoding, decode.
bs = System.IO.File.ReadAllBytes(file_name[i]);
enc = DetectEncodingFromBOM(bs);
if (enc != null)
{
    // A BOM was found: skip it so it does not end up in the decoded text.
    int bomLen = enc.GetPreamble().Length;
    str = enc.GetString(bs, bomLen, bs.Length - bomLen);
}
else
{
    // No BOM: fall back to the byte-pattern heuristic.
    enc = GetCode(bs);
    if (enc == null)
    {
        // Neither the BOM check nor the heuristic identified the encoding.
        Console.WriteLine("文字コードを判別できませんでした。");
        continue;
    }

    // Decode the whole buffer for display.
    str = enc.GetString(bs);
}