jellyfin/Emby.Server.Implementations/TextEncoding/TextEncoding.cs
Erwin de Haan ec1f5dc317 Major code cleanup
Add Argument*Exceptions now use proper nameof operators.

Added exception messages to quite a few Argument*Exceptions.

Fixed rethrowing to be proper syntax.

Added a ton of null checks. (This is only a start, there are about 500 places that need proper null handling)

Added some TODOs to log certain exceptions.

Fix sln again.

Fixed all AssemblyInfo's and added proper copyright (where I could find them)

We live in *current year*.

Fixed the use of braces.

Fixed a ton of properties, and made a fair amount of functions static that should be and can be static.

Made more Methods that should be static static.

You can now use static to find bad functions!

Removed unused variable. And added one more proper XML comment.
2019-01-10 20:38:53 +01:00

271 lines
8.6 KiB
C#

using System;
using System.Text;
using MediaBrowser.Model.IO;
using Microsoft.Extensions.Logging;
using MediaBrowser.Model.Serialization;
using MediaBrowser.Model.Text;
using NLangDetect.Core;
using UniversalDetector;
namespace Emby.Server.Implementations.TextEncoding
{
public class TextEncoding : ITextEncoding
{
// Stored at construction; no member visible in this file reads it.
private readonly IFileSystem _fileSystem;

// Receives the debug messages written by the detection methods.
private readonly ILogger _logger;

// Handed to LanguageDetector.Initialize the first time language detection runs.
// Only ever assigned in the constructor, hence readonly like its siblings.
private readonly IJsonSerializer _json;

/// <summary>
/// Initializes a new instance of the <see cref="TextEncoding"/> class.
/// </summary>
/// <param name="fileSystem">The file system abstraction to keep.</param>
/// <param name="logger">The logger used for debug output.</param>
/// <param name="json">The JSON serializer passed on to the language detector.</param>
public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
{
    _fileSystem = fileSystem;
    _logger = logger;
    _json = json;
}
/// <summary>
/// Gets the ASCII encoding.
/// </summary>
/// <returns><see cref="Encoding.ASCII"/>.</returns>
public Encoding GetASCIIEncoding() => Encoding.ASCII;
/// <summary>
/// Makes a first guess at the encoding of a buffer. Byte-order marks are checked first;
/// when none matches, the buffer is handed to <see cref="TextEncodingDetect"/>.
/// </summary>
/// <param name="buffer">The bytes to inspect.</param>
/// <param name="count">The number of leading bytes of <paramref name="buffer"/> that may be inspected.</param>
/// <returns>The recognised encoding, or <c>null</c> when nothing was recognised.</returns>
private static Encoding GetInitialEncoding(byte[] buffer, int count)
{
    if (count >= 4)
    {
        // The UTF-32 marks are tested before the UTF-16 ones because
        // FF FE 00 00 (UTF-32 LE) begins with FF FE (UTF-16 LE).
        if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
        {
            // 00 00 FE FF is the big-endian mark; Encoding.UTF32 is little-endian.
            return new UTF32Encoding(true, true);
        }

        if (buffer[0] == 0xff && buffer[1] == 0xfe && buffer[2] == 0 && buffer[3] == 0)
        {
            return Encoding.UTF32;
        }
    }

    if (count >= 3)
    {
        if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
        {
            return Encoding.UTF8;
        }

        if (buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
        {
            return Encoding.UTF7;
        }
    }

    if (count >= 2)
    {
        if (buffer[0] == 0xfe && buffer[1] == 0xff)
        {
            // FE FF is the big-endian mark; Encoding.Unicode is little-endian.
            return Encoding.BigEndianUnicode;
        }

        if (buffer[0] == 0xff && buffer[1] == 0xfe)
        {
            return Encoding.Unicode;
        }
    }

    // No byte-order mark: fall back to content-based detection.
    switch (new TextEncodingDetect().DetectEncoding(buffer, count))
    {
        // NOTE(review): Ansi is reported as ASCII here, exactly as before; confirm whether
        // an 8-bit code page would be the better answer.
        case TextEncodingDetect.CharacterEncoding.Ansi:
        case TextEncodingDetect.CharacterEncoding.Ascii:
            return Encoding.ASCII;
        case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
        case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
            return Encoding.BigEndianUnicode;
        case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
        case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
            return Encoding.Unicode;
        case TextEncodingDetect.CharacterEncoding.Utf8Bom:
        case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
            return Encoding.UTF8;
        default:
            return null;
    }
}
// Set just before LanguageDetector.Initialize is called so that it is invoked only once.
private bool _langDetectInitialized;

/// <summary>
/// Works out the name of the character set that a block of text bytes is written in.
/// </summary>
/// <param name="bytes">The raw text bytes.</param>
/// <param name="count">The number of bytes of <paramref name="bytes"/> to examine.</param>
/// <param name="language">The language of the text, if known; may be null or blank.</param>
/// <param name="enableLanguageDetection">Whether the language may be detected when it is not supplied.</param>
/// <returns>
/// "utf-8" when UTF-8 is recognised; otherwise the detected charset unless it is windows-1252;
/// otherwise the charset associated with the language when one is known; otherwise <c>null</c>.
/// </returns>
public string GetDetectedEncodingName(byte[] bytes, int count, string language, bool enableLanguageDetection)
{
    const int start = 0;

    // Cheap check first: a recognised UTF-8 buffer needs no further analysis.
    var initialGuess = GetInitialEncoding(bytes, count);
    if (object.Equals(initialGuess, Encoding.UTF8))
    {
        return "utf-8";
    }

    if (enableLanguageDetection && string.IsNullOrWhiteSpace(language))
    {
        if (!_langDetectInitialized)
        {
            _langDetectInitialized = true;
            LanguageDetector.Initialize(_json);
        }

        language = DetectLanguage(bytes, start, count);

        if (!string.IsNullOrWhiteSpace(language))
        {
            _logger.LogDebug("Text language detected as {0}", language);
        }
    }

    var detectedCharset = DetectCharset(bytes, start, count, language);
    var hasCharset = !string.IsNullOrWhiteSpace(detectedCharset);

    if (hasCharset && string.Equals(detectedCharset, "utf-8", StringComparison.OrdinalIgnoreCase))
    {
        return "utf-8";
    }

    // A windows-1252 result is not returned directly; the language lookup below gets the final say.
    if (hasCharset && !string.Equals(detectedCharset, "windows-1252", StringComparison.OrdinalIgnoreCase))
    {
        return detectedCharset;
    }

    return string.IsNullOrWhiteSpace(language)
        ? null
        : GetFileCharacterSetFromLanguage(language);
}
/// <summary>
/// Tries to detect the language of the text by decoding the bytes as UTF-8, then ASCII,
/// then UTF-16 (little-endian), and passing each decoded string to the language detector.
/// </summary>
/// <param name="bytes">The raw text bytes.</param>
/// <param name="index">The offset of the first byte to decode.</param>
/// <param name="count">The number of bytes to decode.</param>
/// <returns>
/// The result of the first attempt that does not throw a <see cref="NLangDetectException"/>,
/// or <c>null</c> when every attempt throws one.
/// </returns>
private string DetectLanguage(byte[] bytes, int index, int count)
{
    // The order matters: the first decoding for which detection succeeds wins.
    var candidates = new[] { Encoding.UTF8, Encoding.ASCII, Encoding.Unicode };

    foreach (var candidate in candidates)
    {
        try
        {
            var text = candidate.GetString(bytes, index, count);
            return LanguageDetector.DetectLanguage(text);
        }
        catch (NLangDetectException ex)
        {
            _logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException.");
        }
    }

    return null;
}
/// <summary>
/// Resolves a charset name to an <see cref="Encoding"/>. If the name is rejected, the lookup
/// is retried once with every hyphen removed from the name.
/// </summary>
/// <param name="charset">The charset name, for example "windows-1251".</param>
/// <returns>The matching encoding.</returns>
/// <exception cref="ArgumentNullException"><paramref name="charset"/> is null, empty or whitespace.</exception>
/// <exception cref="ArgumentException">Neither the name nor its hyphen-free form is a known encoding.</exception>
public Encoding GetEncodingFromCharset(string charset)
{
    const string LogTemplate = "Getting encoding object for character set: {0}";

    if (string.IsNullOrWhiteSpace(charset))
    {
        throw new ArgumentNullException(nameof(charset));
    }

    _logger.LogDebug(LogTemplate, charset);

    try
    {
        return Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // Second chance: some names are only accepted without hyphens.
        var withoutHyphens = charset.Replace("-", string.Empty);
        _logger.LogDebug(LogTemplate, withoutHyphens);

        return Encoding.GetEncoding(withoutHyphens);
    }
}
/// <summary>
/// Detects the charset of the given bytes and resolves it to an <see cref="Encoding"/>.
/// </summary>
/// <param name="bytes">The raw text bytes.</param>
/// <param name="size">The number of bytes of <paramref name="bytes"/> to examine.</param>
/// <param name="language">The language of the text, if known; may be null or blank.</param>
/// <param name="enableLanguageDetection">Whether the language may be detected when it is not supplied.</param>
/// <returns>The encoding for the detected charset.</returns>
/// <exception cref="ArgumentNullException">No charset could be detected.</exception>
public Encoding GetDetectedEncoding(byte[] bytes, int size, string language, bool enableLanguageDetection)
    => GetEncodingFromCharset(GetDetectedEncodingName(bytes, size, language, enableLanguageDetection));
/// <summary>
/// Maps a three-letter language code to the name of the code page used as a fallback for
/// text in that language.
/// </summary>
/// <param name="language">The language code; matched case-insensitively. Must not be null.</param>
/// <returns>The charset name for the language, or "windows-1252" when the code is not listed.</returns>
private static string GetFileCharacterSetFromLanguage(string language)
{
    // https://developer.xamarin.com/api/type/System.Text.Encoding/
    // The codes are culture-neutral identifiers, so they are lowered with the invariant culture:
    // a culture-sensitive ToLower() turns "I" into a dotless "ı" under e.g. tr-TR and the
    // lookup would then miss codes such as "VIE".
    switch (language.ToLowerInvariant())
    {
        case "tha":
            return "windows-874";

        // NOTE(review): Hungarian resolves to the same value as the default; confirm that
        // windows-1252 (rather than windows-1250) is intended here.
        case "hun":
            return "windows-1252";

        case "pol":
        case "cze":
        case "ces":
        case "slo":
        case "srp":
        case "hrv":
        case "rum":
        case "ron":
        case "rom":
        case "rup":
        // albanian
        case "alb":
        case "sqi":
        // slovak, slovenian
        case "slk":
        case "slv":
            return "windows-1250";

        case "ara":
            return "windows-1256";

        case "heb":
            return "windows-1255";

        case "grc":
        // greek
        case "gre":
        case "ell":
            return "windows-1253";

        case "crh":
        case "ota":
        case "tur":
            return "windows-1254";

        // bulgarian
        case "bul":
        case "bgr":
        case "rus":
            return "windows-1251";

        case "vie":
            return "windows-1258";

        case "kor":
            return "cp949";

        default:
            return "windows-1252";
    }
}
/// <summary>
/// Runs the bytes through a <see cref="CharsetDetector"/> and returns the charset it reports.
/// </summary>
/// <param name="bytes">The raw text bytes.</param>
/// <param name="index">The offset of the first byte to feed to the detector.</param>
/// <param name="count">The number of bytes to feed to the detector.</param>
/// <param name="language">The language of the text, if known; may be null or blank.</param>
/// <returns>
/// The detected charset, or <c>null</c> when the detector reports "x-mac-cyrillic" and a
/// language is available for the caller to fall back on.
/// </returns>
private static string DetectCharset(byte[] bytes, int index, int count, string language)
{
    var sniffer = new CharsetDetector();
    sniffer.Feed(bytes, index, count);
    sniffer.DataEnd();

    var detected = sniffer.Charset;

    // x-mac-cyrillic is often detected incorrectly. When a language is known, discard the
    // result so that other techniques are used instead.
    var isMacCyrillic = string.Equals("x-mac-cyrillic", detected, StringComparison.OrdinalIgnoreCase);
    if (isMacCyrillic && !string.IsNullOrWhiteSpace(language))
    {
        return null;
    }

    return detected;
}
}
}