From 89ae4a79104de15a20d61546e724e8f1be586e93 Mon Sep 17 00:00:00 2001
From: niel-archer
Date: Thu, 2 Feb 2012 06:56:39 +0000
Subject: [PATCH] Initial import of TextFileEncodingDetector class.

---
 trunk/Helpers/TextFileEncodingDetector.cs | 453 ++++++++++++++++++++++
 1 file changed, 453 insertions(+)
 create mode 100644 trunk/Helpers/TextFileEncodingDetector.cs

diff --git a/trunk/Helpers/TextFileEncodingDetector.cs b/trunk/Helpers/TextFileEncodingDetector.cs
new file mode 100644
index 0000000..7905a5e
--- /dev/null
+++ b/trunk/Helpers/TextFileEncodingDetector.cs
@@ -0,0 +1,453 @@
+using System;
+using System.Text;
+using System.Text.RegularExpressions;
+using System.IO;
+
+namespace LSLEditor.Helpers
+{
+    public static class TextFileEncodingDetector
+    {
+        /*
+         * Simple class to handle text file encoding woes (in a primarily English-speaking tech
+         * world).
+         *
+         * - This code is fully managed, no shady calls to MLang (the unmanaged codepage
+         *   detection library originally developed for Internet Explorer).
+         *
+         * - This class does NOT try to detect arbitrary codepages/charsets; it really only
+         *   aims to differentiate between some of the most common variants of Unicode
+         *   encoding, and a "default" (western / ascii-based) encoding alternative provided
+         *   by the caller.
+         *
+         * - As there is no "Reliable" way to distinguish between UTF-8 (without BOM) and
+         *   Windows-1252 (in .Net, also incorrectly called "ASCII") encodings, we use a
+         *   heuristic - so the more of the file we can sample the better the guess. If you
+         *   are going to read the whole file into memory at some point, then it is best to
+         *   pass in the whole byte array directly. Otherwise, decide how to trade off
+         *   reliability against performance / memory usage.
+         *
+         * - The UTF-8 detection heuristic only works for western text, as it relies on
+         *   the presence of UTF-8 encoded accented and other characters found in the upper
+         *   ranges of the Latin-1 and (particularly) Windows-1252 codepages.
+         *
+         * - For more general detection routines, see existing projects / resources:
+         *   - MLang - Microsoft library originally for IE6, available in Windows XP and later APIs now (I think?)
+         *     - MLang .Net bindings: http://www.codeproject.com/KB/recipes/DetectEncoding.aspx
+         *   - CharDet - Mozilla browser's detection routines
+         *     - Ported to Java then .Net: http://www.conceptdevelopment.net/Localization/NCharDet/
+         *     - Ported straight to .Net: http://code.google.com/p/chardetsharp/source/browse
+         *
+         * Copyright Tao Klerks, Jan 2010, tao@klerks.biz
+         * Licensed under the modified BSD license:
+         *
+
+Redistribution and use in source and binary forms, with or without modification, are
+permitted provided that the following conditions are met:
+
+ - Redistributions of source code must retain the above copyright notice, this list of
+conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright notice, this list
+of conditions and the following disclaimer in the documentation and/or other materials
+provided with the distribution.
+ - The name of the author may not be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+OF SUCH DAMAGE.
+
+         *
+         */
+
+        const long _defaultHeuristicSampleSize = 0x10000; //completely arbitrary - inappropriate for high numbers of files / high speed requirements
+
+        public static Encoding DetectTextFileEncoding(string InputFilename, Encoding DefaultEncoding)
+        {
+            using (FileStream textfileStream = File.OpenRead(InputFilename))
+            {
+                return DetectTextFileEncoding(textfileStream, DefaultEncoding, _defaultHeuristicSampleSize);
+            }
+        }
+
+        public static Encoding DetectTextFileEncoding(FileStream InputFileStream, Encoding DefaultEncoding, long HeuristicSampleSize)
+        {
+            if (InputFileStream == null)
+                throw new ArgumentNullException("InputFileStream", "Must provide a valid FileStream!");
+
+            if (!InputFileStream.CanRead)
+                throw new ArgumentException("Provided file stream is not readable!", "InputFileStream");
+
+            if (!InputFileStream.CanSeek)
+                throw new ArgumentException("Provided file stream cannot seek!", "InputFileStream");
+
+            Encoding encodingFound = null;
+
+            long originalPos = InputFileStream.Position;
+
+            InputFileStream.Position = 0;
+
+
+            //First read only what we need for BOM detection
+
+            byte[] bomBytes = new byte[InputFileStream.Length > 4 ? 4 : InputFileStream.Length];
+            InputFileStream.Read(bomBytes, 0, bomBytes.Length);
+
+            encodingFound = DetectBOMBytes(bomBytes);
+
+            if (encodingFound != null)
+            {
+                InputFileStream.Position = originalPos;
+                return encodingFound;
+            }
+
+
+            //BOM Detection failed, going for heuristics now.
+            //  create sample byte array and populate it
+            byte[] sampleBytes = new byte[HeuristicSampleSize > InputFileStream.Length ? InputFileStream.Length : HeuristicSampleSize];
+            Array.Copy(bomBytes, sampleBytes, bomBytes.Length);
+            if (InputFileStream.Length > bomBytes.Length)
+                InputFileStream.Read(sampleBytes, bomBytes.Length, sampleBytes.Length - bomBytes.Length);
+            InputFileStream.Position = originalPos;
+
+            //test byte array content
+            encodingFound = DetectUnicodeInByteSampleByHeuristics(sampleBytes);
+
+            if (encodingFound != null)
+                return encodingFound;
+            else
+                return DefaultEncoding;
+        }
+
+        public static Encoding DetectTextByteArrayEncoding(byte[] TextData, Encoding DefaultEncoding)
+        {
+            if (TextData == null)
+                throw new ArgumentNullException("TextData", "Must provide a valid text data byte array!");
+
+            Encoding encodingFound = null;
+
+            encodingFound = DetectBOMBytes(TextData);
+
+            if (encodingFound != null)
+            {
+                return encodingFound;
+            }
+            else
+            {
+                //test byte array content
+                encodingFound = DetectUnicodeInByteSampleByHeuristics(TextData);
+
+                if (encodingFound != null)
+                    return encodingFound;
+                else
+                    return DefaultEncoding;
+            }
+
+
+        }
+
+        public static Encoding DetectBOMBytes(byte[] BOMBytes)
+        {
+            if (BOMBytes == null)
+                throw new ArgumentNullException("BOMBytes", "Must provide a valid BOM byte array!");
+
+            if (BOMBytes.Length < 2)
+                return null;
+
+            if (BOMBytes[0] == 0xff
+                && BOMBytes[1] == 0xfe
+                && (BOMBytes.Length < 4
+                    || BOMBytes[2] != 0
+                    || BOMBytes[3] != 0
+                    )
+                )
+                return Encoding.Unicode;
+
+            if (BOMBytes[0] == 0xfe
+                && BOMBytes[1] == 0xff
+                )
+                return Encoding.BigEndianUnicode;
+
+            if (BOMBytes.Length < 3)
+                return null;
+
+            if (BOMBytes[0] == 0xef && BOMBytes[1] == 0xbb && BOMBytes[2] == 0xbf)
+                return Encoding.UTF8;
+
+            if (BOMBytes[0] == 0x2b && BOMBytes[1] == 0x2f && BOMBytes[2] == 0x76)
+                return Encoding.UTF7;
+
+            if (BOMBytes.Length < 4)
+                return null;
+
+            if (BOMBytes[0] == 0xff && BOMBytes[1] == 0xfe && BOMBytes[2] == 0 && BOMBytes[3] == 0)
+                return Encoding.UTF32;
+
+            if (BOMBytes[0] == 0 && BOMBytes[1] == 0 && BOMBytes[2] == 0xfe && BOMBytes[3] == 0xff)
+                return Encoding.GetEncoding(12001);
+
+            return null;
+        }
+
+        public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)
+        {
+            long oddBinaryNullsInSample = 0;
+            long evenBinaryNullsInSample = 0;
+            long suspiciousUTF8SequenceCount = 0;
+            long suspiciousUTF8BytesTotal = 0;
+            long likelyUSASCIIBytesInSample = 0;
+
+            //Cycle through, keeping count of binary null positions, possible UTF-8
+            //  sequences from upper ranges of Windows-1252, and probable US-ASCII
+            //  character counts.
+
+            long currentPos = 0;
+            int skipUTF8Bytes = 0;
+
+            while (currentPos < SampleBytes.Length)
+            {
+                //binary null distribution
+                if (SampleBytes[currentPos] == 0)
+                {
+                    if (currentPos % 2 == 0)
+                        evenBinaryNullsInSample++;
+                    else
+                        oddBinaryNullsInSample++;
+                }
+
+                //likely US-ASCII characters
+                if (IsCommonUSASCIIByte(SampleBytes[currentPos]))
+                    likelyUSASCIIBytesInSample++;
+
+                //suspicious sequences (look like UTF-8)
+                if (skipUTF8Bytes == 0)
+                {
+                    int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);
+
+                    if (lengthFound > 0)
+                    {
+                        suspiciousUTF8SequenceCount++;
+                        suspiciousUTF8BytesTotal += lengthFound;
+                        skipUTF8Bytes = lengthFound - 1;
+                    }
+                }
+                else
+                {
+                    skipUTF8Bytes--;
+                }
+
+                currentPos++;
+            }
+
+            //1: UTF-16 LE - in English / European environments, this is usually characterized by a
+            //  high proportion of odd binary nulls (starting at 0), with (as this is text) a low
+            //  proportion of even binary nulls.
+            //  The thresholds used here (less than 20% nulls where you expect non-nulls, and more than
+            //  60% nulls where you do expect nulls) are completely arbitrary.
+
+            if (((evenBinaryNullsInSample * 2.0) / SampleBytes.Length) < 0.2
+                && ((oddBinaryNullsInSample * 2.0) / SampleBytes.Length) > 0.6
+                )
+                return Encoding.Unicode;
+
+
+            //2: UTF-16 BE - in English / European environments, this is usually characterized by a
+            //  high proportion of even binary nulls (starting at 0), with (as this is text) a low
+            //  proportion of odd binary nulls.
+            //  The thresholds used here (less than 20% nulls where you expect non-nulls, and more than
+            //  60% nulls where you do expect nulls) are completely arbitrary.
+
+            if (((oddBinaryNullsInSample * 2.0) / SampleBytes.Length) < 0.2
+                && ((evenBinaryNullsInSample * 2.0) / SampleBytes.Length) > 0.6
+                )
+                return Encoding.BigEndianUnicode;
+
+
+            //3: UTF-8 - Martin Dürst outlines a method for detecting whether something CAN be UTF-8 content
+            //  using regexp, in his w3c.org unicode FAQ entry:
+            //  http://www.w3.org/International/questions/qa-forms-utf-8
+            //  adapted here for C#; the sample is decoded with a single-byte codepage so the
+            //  regex sees the raw byte values unchanged.
+            string potentiallyMangledString = Encoding.GetEncoding("ISO-8859-1").GetString(SampleBytes);
+            Regex UTF8Validator = new Regex(@"\A("
+                + @"[\x09\x0A\x0D\x20-\x7E]"
+                + @"|[\xC2-\xDF][\x80-\xBF]"
+                + @"|\xE0[\xA0-\xBF][\x80-\xBF]"
+                + @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}"
+                + @"|\xED[\x80-\x9F][\x80-\xBF]"
+                + @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"
+                + @"|[\xF1-\xF3][\x80-\xBF]{3}"
+                + @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"
+                + @")*\z");
+            if (UTF8Validator.IsMatch(potentiallyMangledString))
+            {
+                //Unfortunately, just the fact that it CAN be UTF-8 doesn't tell you much about probabilities.
+                //If all the characters are in the 0-127 range, no harm done, as most western charsets are the same as UTF-8 in these ranges.
+                //If some of the characters were in the upper range (western accented characters), however, they would likely be mangled into 2-byte sequences by the UTF-8 encoding process.
+                // So, we need to play stats.
+
+                // The "Random" likelihood of any pair of randomly generated characters being one
+                //  of these "suspicious" character sequences is:
+                //     128 / (256 * 256) = 0.2%.
+                //
+                // In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127
+                //  character range, so we assume that more than 1 in 500,000 of these character
+                //  sequences indicates UTF-8. The number 500,000 is completely arbitrary - so sue me.
+                //
+                // We can only assume these character sequences will be rare if we ALSO assume that this
+                //  IS in fact western text - in which case the bulk of the UTF-8 encoded data (that is
+                //  not already suspicious sequences) should be plain US-ASCII bytes. This, I
+                //  arbitrarily decided, should be 80% (a random distribution, eg binary data, would yield
+                //  approx 40%, so the chances of hitting this threshold by accident in random data are
+                //  VERY low).
+
+                if ((suspiciousUTF8SequenceCount * 500000.0 / SampleBytes.Length >= 1) //suspicious sequences
+                    && (
+                        //all suspicious, so cannot evaluate proportion of US-ASCII
+                        SampleBytes.Length - suspiciousUTF8BytesTotal == 0
+                        ||
+                        likelyUSASCIIBytesInSample * 1.0 / (SampleBytes.Length - suspiciousUTF8BytesTotal) >= 0.8
+                        )
+                    )
+                    return Encoding.UTF8;
+            }
+
+            return null;
+        }
+
+        private static bool IsCommonUSASCIIByte(byte testByte)
+        {
+            if (testByte == 0x0A //lf
+                || testByte == 0x0D //cr
+                || testByte == 0x09 //tab
+                || (testByte >= 0x20 && testByte <= 0x2F) //common punctuation
+                || (testByte >= 0x30 && testByte <= 0x39) //digits
+                || (testByte >= 0x3A && testByte <= 0x40) //common punctuation
+                || (testByte >= 0x41 && testByte <= 0x5A) //capital letters
+                || (testByte >= 0x5B && testByte <= 0x60) //common punctuation
+                || (testByte >= 0x61 && testByte <= 0x7A) //lowercase letters
+                || (testByte >= 0x7B && testByte <= 0x7E) //common punctuation
+                )
+                return true;
+            else
+                return false;
+        }
+
+        private static int DetectSuspiciousUTF8SequenceLength(byte[] SampleBytes, long currentPos)
+        {
+            int lengthFound = 0;
+
+            if (SampleBytes.Length >= currentPos + 2
+                && SampleBytes[currentPos] == 0xC2
+                )
+            {
+                if (SampleBytes[currentPos + 1] == 0x81
+                    || SampleBytes[currentPos + 1] == 0x8D
+                    || SampleBytes[currentPos + 1] == 0x8F
+                    )
+                    lengthFound = 2;
+                else if (SampleBytes[currentPos + 1] == 0x90
+                    || SampleBytes[currentPos + 1] == 0x9D
+                    )
+                    lengthFound = 2;
+                else if (SampleBytes[currentPos + 1] >= 0xA0
+                    && SampleBytes[currentPos + 1] <= 0xBF
+                    )
+                    lengthFound = 2;
+            }
+            else if (SampleBytes.Length >= currentPos + 2
+                && SampleBytes[currentPos] == 0xC3
+                )
+            {
+                if (SampleBytes[currentPos + 1] >= 0x80
+                    && SampleBytes[currentPos + 1] <= 0xBF
+                    )
+                    lengthFound = 2;
+            }
+            else if (SampleBytes.Length >= currentPos + 2
+                && SampleBytes[currentPos] == 0xC5
+                )
+            {
+                if (SampleBytes[currentPos + 1] == 0x92
+                    || SampleBytes[currentPos + 1] == 0x93
+                    )
+                    lengthFound = 2;
+                else if (SampleBytes[currentPos + 1] == 0xA0
+                    || SampleBytes[currentPos + 1] == 0xA1
+                    )
+                    lengthFound = 2;
+                else if (SampleBytes[currentPos + 1] == 0xB8
+                    || SampleBytes[currentPos + 1] == 0xBD
+                    || SampleBytes[currentPos + 1] == 0xBE
+                    )
+                    lengthFound = 2;
+            }
+            else if (SampleBytes.Length >= currentPos + 2
+                && SampleBytes[currentPos] == 0xC6
+                )
+            {
+                if (SampleBytes[currentPos + 1] == 0x92)
+                    lengthFound = 2;
+            }
+            else if (SampleBytes.Length >= currentPos + 2
+                && SampleBytes[currentPos] == 0xCB
+                )
+            {
+                if (SampleBytes[currentPos + 1] == 0x86
+                    || SampleBytes[currentPos + 1] == 0x9C
+                    )
+                    lengthFound = 2;
+            }
+            else if (SampleBytes.Length >= currentPos + 3
+                && SampleBytes[currentPos] == 0xE2
+                )
+            {
+                if (SampleBytes[currentPos + 1] == 0x80)
+                {
+                    if (SampleBytes[currentPos + 2] == 0x93
+                        || SampleBytes[currentPos + 2] == 0x94
+                        )
+                        lengthFound = 3;
+                    if (SampleBytes[currentPos + 2] == 0x98
+                        || SampleBytes[currentPos + 2] == 0x99
+                        || SampleBytes[currentPos + 2] == 0x9A
+                        )
+                        lengthFound = 3;
+                    if (SampleBytes[currentPos + 2] == 0x9C
+                        || SampleBytes[currentPos + 2] == 0x9D
+                        || SampleBytes[currentPos + 2] == 0x9E
+                        )
+                        lengthFound = 3;
+                    if (SampleBytes[currentPos + 2] == 0xA0
+                        || SampleBytes[currentPos + 2] == 0xA1
+                        || SampleBytes[currentPos + 2] == 0xA2
+                        )
+                        lengthFound = 3;
+                    if (SampleBytes[currentPos + 2] == 0xA6)
+                        lengthFound = 3;
+                    if (SampleBytes[currentPos + 2] == 0xB0)
+                        lengthFound = 3;
+                    if (SampleBytes[currentPos + 2] == 0xB9
+                        || SampleBytes[currentPos + 2] == 0xBA
+                        )
+                        lengthFound = 3;
+                }
+                else if (SampleBytes[currentPos + 1] == 0x82
+                    && SampleBytes[currentPos + 2] == 0xAC
+                    )
+                    lengthFound = 3;
+                else if (SampleBytes[currentPos + 1] == 0x84
+                    && SampleBytes[currentPos + 2] == 0xA2
+                    )
+                    lengthFound = 3;
+            }
+
+            return lengthFound;
+        }
+
+    }
+}
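
For reference, a minimal usage sketch of the class being imported (assuming the file above is
built into the LSLEditor project as-is; the script path and the Windows-1252 fallback below are
illustrative only, not part of the patch):

    using System;
    using System.IO;
    using System.Text;
    using LSLEditor.Helpers;

    class EncodingDetectionExample
    {
        static void Main()
        {
            // Hypothetical input file - any text file will do.
            string path = @"C:\scripts\example.lsl";

            // BOM detection runs first, then the byte-sample heuristics; the caller-supplied
            // default (Windows-1252 here) is returned when nothing conclusive is found.
            Encoding detected = TextFileEncodingDetector.DetectTextFileEncoding(
                path, Encoding.GetEncoding(1252));

            Console.WriteLine("Detected encoding: " + detected.WebName);
            string text = File.ReadAllText(path, detected);
        }
    }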