文字コードを調べて入力

ファイルの文字コードを調べて入力します。

プログラムの説明

Text File とは、Ｃ言語やＣ＃のソースプログラムのように、文字コードを使って書かれたファイルのことを言います。
Text File には utf-8, utf-16, Shift_JIS など色々な文字コードが使われています。
ファイルの文字コードを調べて入力します。
文字コードの説明は Store Guid を参照して下さい。

文字コードを調べてファイルを入力する Check_Code.cs です。
C:\DATA\C#\BAT のフォルダーに格納して下さい。

/*★ 文字コードを判別してデーターを入力    前田 稔 ★*/
using System;
using System.IO;    // for File, StreamReader
using System.Text;  // for Encoding

class TextFileRead
{
    /// <summary>
    /// Reads every sample file as raw bytes, determines its encoding
    /// (byte order mark first, content analysis as a fallback) and writes
    /// the decoded text to the console.
    /// </summary>
    /// <returns>Always 0.</returns>
    public static int Main()
    {
        string[] paths = {
            "C:\\data\\Test\\utf8.txt", "C:\\data\\Test\\utf8_bom.txt", "C:\\data\\Test\\utf16.txt",
            "C:\\data\\Test\\utf16LE.txt", "C:\\data\\Test\\utf16BE.txt", "C:\\data\\Test\\shift_jis.txt" };

        foreach (string path in paths)
        {
            Console.WriteLine("\n☆File Name : " + path);
            byte[] raw = System.IO.File.ReadAllBytes(path);

            // Number of leading bytes to skip: the BOM length when a BOM
            // identified the encoding, otherwise nothing.
            int skip = 0;
            System.Text.Encoding detected = DetectEncodingFromBOM(raw);
            if (detected != null)
            {
                skip = detected.GetPreamble().Length;
            }
            else
            {
                // No BOM: guess the encoding from the byte patterns.
                detected = GetCode(raw);
            }

            if (detected == null)
            {
                // Neither the BOM check nor the content analysis succeeded.
                Console.WriteLine("BOMが見つかりませんでした。");
                continue;
            }

            Console.Write(detected.GetString(raw, skip, raw.Length - skip));
        }
        return 0;
    }

BOMを調べて、文字コードを判別する関数です。

/// <summary>
/// Identifies the encoding of a byte buffer from its byte order mark (BOM).
/// </summary>
/// <param name="bytes">Raw file contents, starting at the first byte.</param>
/// <returns>
/// A BOM-emitting, strict (throw on invalid bytes) encoding instance for
/// UTF-16 BE/LE, UTF-32 LE/BE or UTF-8, or null when no known BOM is found.
/// </returns>
public static System.Text.Encoding DetectEncodingFromBOM(byte[] bytes)
{
    int count = bytes.Length;
    if (count < 2)
    {
        // Every supported BOM is at least two bytes long.
        return null;
    }

    switch (bytes[0])
    {
        case 0xfe:
            // FE FF : UTF-16 big endian
            if (bytes[1] == 0xff)
            {
                return new System.Text.UnicodeEncoding(true, true);
            }
            break;

        case 0xff:
            if (bytes[1] == 0xfe)
            {
                // FF FE 00 00 : UTF-32 little endian (checked first because
                // it starts with the UTF-16 LE mark).
                if (count >= 4 && bytes[2] == 0x00 && bytes[3] == 0x00)
                {
                    return new System.Text.UTF32Encoding(false, true);
                }
                // FF FE : UTF-16 little endian
                return new System.Text.UnicodeEncoding(false, true);
            }
            break;

        case 0xef:
            // EF BB BF : UTF-8
            if (count >= 3 && bytes[1] == 0xbb && bytes[2] == 0xbf)
            {
                return new System.Text.UTF8Encoding(true, true);
            }
            break;

        case 0x00:
            // 00 00 FE FF : UTF-32 big endian
            if (count >= 4 && bytes[1] == 0x00 &&
                bytes[2] == 0xfe && bytes[3] == 0xff)
            {
                return new System.Text.UTF32Encoding(true, true);
            }
            break;
    }

    return null;
}

BOM が無い時、文字を解析して文字コードを推測する関数です。

/// <summary>
/// Guesses the encoding of BOM-less text by inspecting its byte patterns.
/// </summary>
/// <remarks>
/// The checks run in this order:
/// 1. Control bytes (0x00-0x06, 0x7F, 0xFF) mark the data as binary; a 0x00
///    followed by a 7-bit byte is taken as raw UTF-16 instead.
/// 2. Data with no ESC and no byte of 0x80 or above is plain ASCII.
/// 3. A JIS escape sequence selects code page 50220.
/// 4. Otherwise the bytes that look like Shift_JIS, EUC-JP and UTF-8
///    multi-byte characters are counted and the largest count wins.
/// When the UTF-8 count only ties for the largest count, the buffer is
/// validated as strict UTF-8 and UTF-8 is returned if it is well formed.
/// Without this tie-break, valid UTF-8 such as E3 81 82 E3 81 84 scored
/// sjis = 6 and utf8 = 6 and was reported as undetectable.
/// NOTE(review): code pages 932, 50220 and 51932 are assumed to be available
/// through Encoding.GetEncoding; on .NET Core / .NET 5+ this presumably needs
/// CodePagesEncodingProvider to be registered first - confirm for the target.
/// </remarks>
/// <param name="bytes">Raw file contents without a byte order mark.</param>
/// <returns>The guessed encoding, or null when no guess can be made.</returns>
public static System.Text.Encoding GetCode(byte[] bytes)
{
    const byte bEscape = 0x1B;
    const byte bAt = 0x40;
    const byte bDollar = 0x24;
    const byte bAnd = 0x26;
    const byte bOpen = 0x28;    //'('
    const byte bB = 0x42;
    const byte bD = 0x44;
    const byte bJ = 0x4A;
    const byte bI = 0x49;

    int len = bytes.Length;
    byte b1, b2, b3, b4;

    // Step 1: binary data / raw UTF-16.
    bool isBinary = false;
    for (int i = 0; i < len; i++)
    {
        b1 = bytes[i];
        if (b1 <= 0x06 || b1 == 0x7F || b1 == 0xFF)
        {
            isBinary = true;
            if (b1 == 0x00 && i < len - 1 && bytes[i + 1] <= 0x7F)
            {
                // A NUL next to a 7-bit byte smells like raw Unicode.
                return System.Text.Encoding.Unicode;
            }
        }
    }
    if (isBinary)
    {
        return null;
    }

    // Step 2: pure 7-bit text without escape sequences is ASCII.
    bool notJapanese = true;
    for (int i = 0; i < len; i++)
    {
        b1 = bytes[i];
        if (b1 == bEscape || 0x80 <= b1)
        {
            notJapanese = false;
            break;
        }
    }
    if (notJapanese)
    {
        return System.Text.Encoding.ASCII;
    }

    // Step 3: JIS escape sequences.
    for (int i = 0; i < len - 2; i++)
    {
        if (bytes[i] != bEscape)
        {
            continue;
        }
        b2 = bytes[i + 1];
        b3 = bytes[i + 2];

        // ESC $ @ (JIS X 0208-1978), ESC $ B (JIS X 0208-1983),
        // ESC ( B / ESC ( J (ASCII / JIS Roman), ESC ( I (half-width kana).
        bool threeByteEscape =
            (b2 == bDollar && (b3 == bAt || b3 == bB)) ||
            (b2 == bOpen && (b3 == bB || b3 == bJ || b3 == bI));
        if (threeByteEscape)
        {
            return System.Text.Encoding.GetEncoding(50220);
        }

        if (i < len - 3)
        {
            b4 = bytes[i + 3];
            if (b2 == bDollar && b3 == bOpen && b4 == bD)
            {
                // ESC $ ( D : JIS X 0212
                return System.Text.Encoding.GetEncoding(50220);
            }
            if (i < len - 5 &&
                b2 == bAnd && b3 == bAt && b4 == bEscape &&
                bytes[i + 4] == bDollar && bytes[i + 5] == bB)
            {
                // ESC & @ ESC $ B : JIS X 0208-1990
                return System.Text.Encoding.GetEncoding(50220);
            }
        }
    }

    // Step 4: score the candidates (Shift_JIS, EUC-JP, UTF-8) by the number
    // of bytes that form plausible multi-byte characters in each encoding.
    int sjis = 0;
    int euc = 0;
    int utf8 = 0;

    for (int i = 0; i < len - 1; i++)
    {
        b1 = bytes[i];
        b2 = bytes[i + 1];
        if (((0x81 <= b1 && b1 <= 0x9F) || (0xE0 <= b1 && b1 <= 0xFC)) &&
            ((0x40 <= b2 && b2 <= 0x7E) || (0x80 <= b2 && b2 <= 0xFC)))
        {
            // Shift_JIS double-byte character; skip its trail byte.
            sjis += 2;
            i++;
        }
    }

    for (int i = 0; i < len - 1; i++)
    {
        b1 = bytes[i];
        b2 = bytes[i + 1];
        if (((0xA1 <= b1 && b1 <= 0xFE) && (0xA1 <= b2 && b2 <= 0xFE)) ||
            (b1 == 0x8E && (0xA1 <= b2 && b2 <= 0xDF)))
        {
            // EUC-JP double-byte character or SS2 + half-width kana.
            euc += 2;
            i++;
        }
        else if (i < len - 2)
        {
            b3 = bytes[i + 2];
            if (b1 == 0x8F && (0xA1 <= b2 && b2 <= 0xFE) &&
                (0xA1 <= b3 && b3 <= 0xFE))
            {
                // SS3 + JIS X 0212 character (three bytes).
                euc += 3;
                i += 2;
            }
        }
    }

    for (int i = 0; i < len - 1; i++)
    {
        b1 = bytes[i];
        b2 = bytes[i + 1];
        if ((0xC0 <= b1 && b1 <= 0xDF) && (0x80 <= b2 && b2 <= 0xBF))
        {
            // UTF-8 two-byte sequence.
            utf8 += 2;
            i++;
        }
        else if (i < len - 2)
        {
            b3 = bytes[i + 2];
            if ((0xE0 <= b1 && b1 <= 0xEF) && (0x80 <= b2 && b2 <= 0xBF) &&
                (0x80 <= b3 && b3 <= 0xBF))
            {
                // UTF-8 three-byte sequence.
                utf8 += 3;
                i += 2;
            }
        }
    }

    System.Diagnostics.Debug.WriteLine(
        string.Format("sjis = {0}, euc = {1}, utf8 = {2}", sjis, euc, utf8));

    if (euc > sjis && euc > utf8)
    {
        // EUC-JP
        return System.Text.Encoding.GetEncoding(51932);
    }
    if (sjis > euc && sjis > utf8)
    {
        // Shift_JIS
        return System.Text.Encoding.GetEncoding(932);
    }
    if (utf8 > euc && utf8 > sjis)
    {
        return System.Text.Encoding.UTF8;
    }

    // No strict winner. If UTF-8 shares the top score, let a strict decoder
    // decide: text from another encoding is very unlikely to be well-formed
    // UTF-8 by accident.
    if (utf8 > 0 && utf8 >= sjis && utf8 >= euc && IsWellFormedUtf8(bytes))
    {
        return System.Text.Encoding.UTF8;
    }

    return null;
}

/// <summary>
/// Returns true when the whole buffer decodes as UTF-8 without any invalid
/// byte sequence.
/// </summary>
private static bool IsWellFormedUtf8(byte[] bytes)
{
    try
    {
        // throwOnInvalidBytes = true makes the decoder reject malformed input.
        new System.Text.UTF8Encoding(false, true).GetCharCount(bytes);
        return true;
    }
    catch (System.ArgumentException)
    {
        // DecoderFallbackException derives from ArgumentException.
        return false;
    }
}
}

Check_Code.cs を CLI でコンパイルします。
Windows10 のスタートアイコンから[Microsoft Visual Studio 2005][Visual Studio 2005 コマンドプロンプト] を起動します。
ここから起動するとコンパイル環境が設定されます。
cd コマンドで C:\DATA\C#\BAT のフォルダーに移動します。
csc Check_Code.cs でコンパイルします。
Check_Code.exe で実行します。
詳細は「CLI で操作」を参照して下さい。

入力するファイル名を file_name[] 配列で定義しています。

ファイル名	文字コード	BOM
utf8.txt	utf-8	BOM 無し
utf8_bom.txt	utf-8	BOM 有り
utf16.txt	utf-16	BOM 有り
utf16LE.txt	utf-16LE	BOM 有り
utf16BE.txt	utf-16BE	BOM 有り
shift_jis.txt	Shift_JIS	BOM 無し

最初に BOM で文字コードを調べます。
BOM が見つからなかったとき、文字を解析して文字コードを推測します。

            // Load the whole file as raw bytes so the encoding can be examined.
            bs = System.IO.File.ReadAllBytes(file_name[i]);
            // First try to identify the encoding from a byte order mark.
            enc = DetectEncodingFromBOM(bs);
            if (enc != null)
            {
                // Skip the BOM bytes so they are not decoded as text.
                int bomLen = enc.GetPreamble().Length;
                str = enc.GetString(bs, bomLen, bs.Length - bomLen);
            }
            else
            {
                // No BOM: guess the encoding from the byte patterns.
                enc = GetCode(bs);
                if (enc == null)
                {   Console.WriteLine("BOMが見つかりませんでした。");
                    continue;
                }
                // Decode the whole buffer with the guessed encoding.
                str = enc.GetString(bs);
            }

file_name[] 配列で定義した全てのファイルを文字化けすることなく印字することが出来ました。
文字コードを調べる関数は「文字コードを判別する」に掲載されていたものを使わせていただきました。

超初心者のプログラム入門(C# Frame Work)