Visual Studio Reformat: Emby.Server.Implementations Part T-T

This commit is contained in:
Erwin de Haan 2019-01-13 20:22:56 +01:00
parent 0efc699e3d
commit 25f0315e91
39 changed files with 1054 additions and 892 deletions

View file

@ -1,14 +1,14 @@
using System;
using System.Collections.Generic;
using System.Linq;
using MediaBrowser.Controller.Configuration;
using MediaBrowser.Controller.Dto;
using MediaBrowser.Controller.Entities; using MediaBrowser.Controller.Entities;
using MediaBrowser.Controller.Entities.TV; using MediaBrowser.Controller.Entities.TV;
using MediaBrowser.Controller.Library; using MediaBrowser.Controller.Library;
using MediaBrowser.Controller.TV; using MediaBrowser.Controller.TV;
using MediaBrowser.Model.Entities; using MediaBrowser.Model.Entities;
using MediaBrowser.Model.Querying; using MediaBrowser.Model.Querying;
using System;
using System.Collections.Generic;
using System.Linq;
using MediaBrowser.Controller.Configuration;
using MediaBrowser.Controller.Dto;
namespace Emby.Server.Implementations.TV namespace Emby.Server.Implementations.TV
{ {

View file

@ -1,10 +1,8 @@
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using NLangDetect.Core.Utils;
using MediaBrowser.Model.Serialization;
using System.Linq; using System.Linq;
using MediaBrowser.Model.Serialization;
using NLangDetect.Core.Utils;
namespace NLangDetect.Core namespace NLangDetect.Core
{ {

View file

@ -1,15 +1,15 @@
namespace NLangDetect.Core namespace NLangDetect.Core
{ {
public enum ErrorCode public enum ErrorCode
{ {
NoTextError, NoTextError,
FormatError, FormatError,
FileLoadError, FileLoadError,
DuplicateLangError, DuplicateLangError,
NeedLoadProfileError, NeedLoadProfileError,
CantDetectError, CantDetectError,
CantOpenTrainData, CantOpenTrainData,
TrainDataFormatError, TrainDataFormatError,
InitParamError, InitParamError,
} }
} }

View file

@ -2,13 +2,13 @@
namespace NLangDetect.Core.Extensions namespace NLangDetect.Core.Extensions
{ {
public static class CharExtensions public static class CharExtensions
{ {
private const int MIN_CODE_POINT = 0x000000; private const int MIN_CODE_POINT = 0x000000;
private const int MAX_CODE_POINT = 0x10ffff; private const int MAX_CODE_POINT = 0x10ffff;
private static readonly int[] _unicodeBlockStarts = private static readonly int[] _unicodeBlockStarts =
{ {
#region Unicode block starts #region Unicode block starts
0x0000, // Basic Latin 0x0000, // Basic Latin
@ -165,8 +165,8 @@ namespace NLangDetect.Core.Extensions
#endregion #endregion
}; };
private static readonly UnicodeBlock?[] _unicodeBlocks = private static readonly UnicodeBlock?[] _unicodeBlocks =
{ {
#region Unicode blocks #region Unicode blocks
UnicodeBlock.BasicLatin, UnicodeBlock.BasicLatin,
UnicodeBlock.Latin1Supplement, UnicodeBlock.Latin1Supplement,
@ -322,53 +322,53 @@ namespace NLangDetect.Core.Extensions
#endregion #endregion
}; };
#region Public methods #region Public methods
/// <remarks> /// <remarks>
/// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL /// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
/// </remarks> /// </remarks>
public static UnicodeBlock? GetUnicodeBlock(this char ch) public static UnicodeBlock? GetUnicodeBlock(this char ch)
{
int codePoint = ch;
if (!IsValidCodePoint(codePoint))
{
throw new ArgumentException("Argument is not a valid code point.", nameof(ch));
}
int top, bottom, current;
bottom = 0;
top = _unicodeBlockStarts.Length;
current = top / 2;
// invariant: top > current >= bottom && codePoint >= unicodeBlockStarts[bottom]
while (top - bottom > 1)
{
if (codePoint >= _unicodeBlockStarts[current])
{ {
bottom = current; int codePoint = ch;
}
else if (!IsValidCodePoint(codePoint))
{ {
top = current; throw new ArgumentException("Argument is not a valid code point.", nameof(ch));
}
int top, bottom, current;
bottom = 0;
top = _unicodeBlockStarts.Length;
current = top / 2;
// invariant: top > current >= bottom && codePoint >= unicodeBlockStarts[bottom]
while (top - bottom > 1)
{
if (codePoint >= _unicodeBlockStarts[current])
{
bottom = current;
}
else
{
top = current;
}
current = (top + bottom) / 2;
}
return _unicodeBlocks[current];
} }
current = (top + bottom) / 2; #endregion
}
return _unicodeBlocks[current]; #region Private helper methods
private static bool IsValidCodePoint(int codePoint)
{
return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
}
#endregion
} }
#endregion
#region Private helper methods
private static bool IsValidCodePoint(int codePoint)
{
return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
}
#endregion
}
} }

View file

@ -2,50 +2,50 @@
namespace NLangDetect.Core.Extensions namespace NLangDetect.Core.Extensions
{ {
public static class RandomExtensions public static class RandomExtensions
{
private const double _Epsilon = 2.22044604925031E-15;
private static readonly object _mutex = new object();
private static double _nextNextGaussian;
private static bool _hasNextNextGaussian;
/// <summary>
/// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
/// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
/// </summary>
/// <remarks>
/// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
/// </remarks>
public static double NextGaussian(this Random random)
{ {
lock (_mutex) private const double _Epsilon = 2.22044604925031E-15;
{
if (_hasNextNextGaussian) private static readonly object _mutex = new object();
private static double _nextNextGaussian;
private static bool _hasNextNextGaussian;
/// <summary>
/// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
/// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
/// </summary>
/// <remarks>
/// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
/// </remarks>
public static double NextGaussian(this Random random)
{ {
_hasNextNextGaussian = false; lock (_mutex)
{
if (_hasNextNextGaussian)
{
_hasNextNextGaussian = false;
return _nextNextGaussian; return _nextNextGaussian;
}
double v1, v2, s;
do
{
v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
s = v1 * v1 + v2 * v2;
}
while (s >= 1.0 || Math.Abs(s - 0.0) < _Epsilon);
double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);
_nextNextGaussian = v2 * multiplier;
_hasNextNextGaussian = true;
return v1 * multiplier;
}
} }
double v1, v2, s;
do
{
v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
s = v1 * v1 + v2 * v2;
}
while (s >= 1.0 || Math.Abs(s - 0.0) < _Epsilon);
double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);
_nextNextGaussian = v2 * multiplier;
_hasNextNextGaussian = true;
return v1 * multiplier;
}
} }
}
} }

View file

@ -1,131 +1,131 @@
namespace NLangDetect.Core.Extensions namespace NLangDetect.Core.Extensions
{ {
public enum UnicodeBlock public enum UnicodeBlock
{ {
BasicLatin, BasicLatin,
Latin1Supplement, Latin1Supplement,
LatinExtendedA, LatinExtendedA,
LatinExtendedB, LatinExtendedB,
IpaExtensions, IpaExtensions,
SpacingModifierLetters, SpacingModifierLetters,
CombiningDiacriticalMarks, CombiningDiacriticalMarks,
Greek, Greek,
Cyrillic, Cyrillic,
CyrillicSupplementary, CyrillicSupplementary,
Armenian, Armenian,
Hebrew, Hebrew,
Arabic, Arabic,
Syriac, Syriac,
Thaana, Thaana,
Devanagari, Devanagari,
Bengali, Bengali,
Gurmukhi, Gurmukhi,
Gujarati, Gujarati,
Oriya, Oriya,
Tamil, Tamil,
Telugu, Telugu,
Kannada, Kannada,
Malayalam, Malayalam,
Sinhala, Sinhala,
Thai, Thai,
Lao, Lao,
Tibetan, Tibetan,
Myanmar, Myanmar,
Georgian, Georgian,
HangulJamo, HangulJamo,
Ethiopic, Ethiopic,
Cherokee, Cherokee,
UnifiedCanadianAboriginalSyllabics, UnifiedCanadianAboriginalSyllabics,
Ogham, Ogham,
Runic, Runic,
Tagalog, Tagalog,
Hanunoo, Hanunoo,
Buhid, Buhid,
Tagbanwa, Tagbanwa,
Khmer, Khmer,
Mongolian, Mongolian,
Limbu, Limbu,
TaiLe, TaiLe,
KhmerSymbols, KhmerSymbols,
PhoneticExtensions, PhoneticExtensions,
LatinExtendedAdditional, LatinExtendedAdditional,
GreekExtended, GreekExtended,
GeneralPunctuation, GeneralPunctuation,
SuperscriptsAndSubscripts, SuperscriptsAndSubscripts,
CurrencySymbols, CurrencySymbols,
CombiningMarksForSymbols, CombiningMarksForSymbols,
LetterlikeSymbols, LetterlikeSymbols,
NumberForms, NumberForms,
Arrows, Arrows,
MathematicalOperators, MathematicalOperators,
MiscellaneousTechnical, MiscellaneousTechnical,
ControlPictures, ControlPictures,
OpticalCharacterRecognition, OpticalCharacterRecognition,
EnclosedAlphanumerics, EnclosedAlphanumerics,
BoxDrawing, BoxDrawing,
BlockElements, BlockElements,
GeometricShapes, GeometricShapes,
MiscellaneousSymbols, MiscellaneousSymbols,
Dingbats, Dingbats,
MiscellaneousMathematicalSymbolsA, MiscellaneousMathematicalSymbolsA,
SupplementalArrowsA, SupplementalArrowsA,
BraillePatterns, BraillePatterns,
SupplementalArrowsB, SupplementalArrowsB,
MiscellaneousMathematicalSymbolsB, MiscellaneousMathematicalSymbolsB,
SupplementalMathematicalOperators, SupplementalMathematicalOperators,
MiscellaneousSymbolsAndArrows, MiscellaneousSymbolsAndArrows,
CjkRadicalsSupplement, CjkRadicalsSupplement,
KangxiRadicals, KangxiRadicals,
IdeographicDescriptionCharacters, IdeographicDescriptionCharacters,
CjkSymbolsAndPunctuation, CjkSymbolsAndPunctuation,
Hiragana, Hiragana,
Katakana, Katakana,
Bopomofo, Bopomofo,
HangulCompatibilityJamo, HangulCompatibilityJamo,
Kanbun, Kanbun,
BopomofoExtended, BopomofoExtended,
KatakanaPhoneticExtensions, KatakanaPhoneticExtensions,
EnclosedCjkLettersAndMonths, EnclosedCjkLettersAndMonths,
CjkCompatibility, CjkCompatibility,
CjkUnifiedIdeographsExtensionA, CjkUnifiedIdeographsExtensionA,
YijingHexagramSymbols, YijingHexagramSymbols,
CjkUnifiedIdeographs, CjkUnifiedIdeographs,
YiSyllables, YiSyllables,
YiRadicals, YiRadicals,
HangulSyllables, HangulSyllables,
HighSurrogates, HighSurrogates,
HighPrivateUseSurrogates, HighPrivateUseSurrogates,
LowSurrogates, LowSurrogates,
PrivateUseArea, PrivateUseArea,
CjkCompatibilityIdeographs, CjkCompatibilityIdeographs,
AlphabeticPresentationForms, AlphabeticPresentationForms,
ArabicPresentationFormsA, ArabicPresentationFormsA,
VariationSelectors, VariationSelectors,
CombiningHalfMarks, CombiningHalfMarks,
CjkCompatibilityForms, CjkCompatibilityForms,
SmallFormVariants, SmallFormVariants,
ArabicPresentationFormsB, ArabicPresentationFormsB,
HalfwidthAndFullwidthForms, HalfwidthAndFullwidthForms,
Specials, Specials,
LinearBSyllabary, LinearBSyllabary,
LinearBIdeograms, LinearBIdeograms,
AegeanNumbers, AegeanNumbers,
OldItalic, OldItalic,
Gothic, Gothic,
Ugaritic, Ugaritic,
Deseret, Deseret,
Shavian, Shavian,
Osmanya, Osmanya,
CypriotSyllabary, CypriotSyllabary,
ByzantineMusicalSymbols, ByzantineMusicalSymbols,
MusicalSymbols, MusicalSymbols,
TaiXuanJingSymbols, TaiXuanJingSymbols,
MathematicalAlphanumericSymbols, MathematicalAlphanumericSymbols,
CjkUnifiedIdeographsExtensionB, CjkUnifiedIdeographsExtensionB,
CjkCompatibilityIdeographsSupplement, CjkCompatibilityIdeographsSupplement,
Tags, Tags,
VariationSelectorsSupplement, VariationSelectorsSupplement,
SupplementaryPrivateUseAreaA, SupplementaryPrivateUseAreaA,
SupplementaryPrivateUseAreaB, SupplementaryPrivateUseAreaB,
} }
} }

View file

@ -1,67 +1,67 @@
using System; using System;
using System.IO;
using System.IO.Compression; using System.IO.Compression;
using System.Xml; using System.Xml;
using NLangDetect.Core.Utils; using NLangDetect.Core.Utils;
using System.IO;
namespace NLangDetect.Core namespace NLangDetect.Core
{ {
// TODO IMM HI: xml reader not tested // TODO IMM HI: xml reader not tested
public static class GenProfile public static class GenProfile
{
#region Public methods
public static LangProfile load(string lang, string file)
{ {
LangProfile profile = new LangProfile(lang); #region Public methods
TagExtractor tagextractor = new TagExtractor("abstract", 100);
Stream inputStream = null;
try public static LangProfile load(string lang, string file)
{
inputStream = File.OpenRead(file);
string extension = Path.GetExtension(file) ?? "";
if (extension.ToUpper() == ".GZ")
{ {
inputStream = new GZipStream(inputStream, CompressionMode.Decompress); LangProfile profile = new LangProfile(lang);
} TagExtractor tagextractor = new TagExtractor("abstract", 100);
Stream inputStream = null;
using (XmlReader xmlReader = XmlReader.Create(inputStream)) try
{
while (xmlReader.Read())
{
switch (xmlReader.NodeType)
{ {
case XmlNodeType.Element: inputStream = File.OpenRead(file);
tagextractor.SetTag(xmlReader.Name);
break;
case XmlNodeType.Text: string extension = Path.GetExtension(file) ?? "";
tagextractor.Add(xmlReader.Value);
break;
case XmlNodeType.EndElement: if (extension.ToUpper() == ".GZ")
tagextractor.CloseTag(profile); {
break; inputStream = new GZipStream(inputStream, CompressionMode.Decompress);
}
using (XmlReader xmlReader = XmlReader.Create(inputStream))
{
while (xmlReader.Read())
{
switch (xmlReader.NodeType)
{
case XmlNodeType.Element:
tagextractor.SetTag(xmlReader.Name);
break;
case XmlNodeType.Text:
tagextractor.Add(xmlReader.Value);
break;
case XmlNodeType.EndElement:
tagextractor.CloseTag(profile);
break;
}
}
}
}
finally
{
if (inputStream != null)
{
inputStream.Close();
}
} }
}
}
}
finally
{
if (inputStream != null)
{
inputStream.Close();
}
}
Console.WriteLine(lang + ": " + tagextractor.Count); Console.WriteLine(lang + ": " + tagextractor.Count);
return profile; return profile;
}
#endregion
} }
#endregion
}
} }

View file

@ -2,21 +2,21 @@
namespace NLangDetect.Core namespace NLangDetect.Core
{ {
[Serializable] [Serializable]
public class InternalException : Exception public class InternalException : Exception
{
#region Constructor(s)
public InternalException(string message, Exception innerException)
: base(message, innerException)
{ {
} #region Constructor(s)
public InternalException(string message) public InternalException(string message, Exception innerException)
: this(message, null) : base(message, innerException)
{ {
} }
#endregion public InternalException(string message)
} : this(message, null)
{
}
#endregion
}
} }

View file

@ -2,44 +2,44 @@ using System.Globalization;
namespace NLangDetect.Core namespace NLangDetect.Core
{ {
// TODO IMM HI: name?? // TODO IMM HI: name??
public class Language public class Language
{
#region Constructor(s)
public Language(string name, double probability)
{ {
Name = name; #region Constructor(s)
Probability = probability;
public Language(string name, double probability)
{
Name = name;
Probability = probability;
}
#endregion
#region Object overrides
public override string ToString()
{
if (Name == null)
{
return "";
}
return
string.Format(
CultureInfo.InvariantCulture.NumberFormat,
"{0}:{1:0.000000}",
Name,
Probability);
}
#endregion
#region Properties
public string Name { get; set; }
public double Probability { get; set; }
#endregion
} }
#endregion
#region Object overrides
public override string ToString()
{
if (Name == null)
{
return "";
}
return
string.Format(
CultureInfo.InvariantCulture.NumberFormat,
"{0}:{1:0.000000}",
Name,
Probability);
}
#endregion
#region Properties
public string Name { get; set; }
public double Probability { get; set; }
#endregion
}
} }

View file

@ -2,22 +2,22 @@
namespace NLangDetect.Core namespace NLangDetect.Core
{ {
public class NLangDetectException : Exception public class NLangDetectException : Exception
{
#region Constructor(s)
public NLangDetectException(string message, ErrorCode errorCode)
: base(message)
{ {
ErrorCode = errorCode; #region Constructor(s)
public NLangDetectException(string message, ErrorCode errorCode)
: base(message)
{
ErrorCode = errorCode;
}
#endregion
#region Properties
public ErrorCode ErrorCode { get; private set; }
#endregion
} }
#endregion
#region Properties
public ErrorCode ErrorCode { get; private set; }
#endregion
}
} }

View file

@ -3,33 +3,33 @@ using System.Collections.Generic;
namespace NLangDetect.Core namespace NLangDetect.Core
{ {
public class ProbVector public class ProbVector
{
private readonly Dictionary<int, double> _dict = new Dictionary<int, double>();
public double this[int key]
{ {
get private readonly Dictionary<int, double> _dict = new Dictionary<int, double>();
{
double value;
return _dict.TryGetValue(key, out value) ? value : 0.0; public double this[int key]
}
set
{
if (Math.Abs(value) < double.Epsilon)
{ {
if (_dict.ContainsKey(key)) get
{ {
_dict.Remove(key); double value;
}
return; return _dict.TryGetValue(key, out value) ? value : 0.0;
}
set
{
if (Math.Abs(value) < double.Epsilon)
{
if (_dict.ContainsKey(key))
{
_dict.Remove(key);
}
return;
}
_dict[key] = value;
}
} }
_dict[key] = value;
}
} }
}
} }

View file

@ -1,10 +1,9 @@
using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Globalization; using System.Globalization;
using System.IO; using System.IO;
using System.Reflection;
using System.Text.RegularExpressions;
using System.Linq; using System.Linq;
using System; using System.Text.RegularExpressions;
namespace NLangDetect.Core.Utils namespace NLangDetect.Core.Utils
{ {
@ -29,7 +28,7 @@ namespace NLangDetect.Core.Utils
private static Dictionary<string, string> LoadMessages() private static Dictionary<string, string> LoadMessages()
{ {
var manifestName = typeof(Messages).Assembly.GetManifestResourceNames().FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1) ; var manifestName = typeof(Messages).Assembly.GetManifestResourceNames().FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1);
Stream messagesStream = Stream messagesStream =
typeof(Messages).Assembly typeof(Messages).Assembly

View file

@ -6,14 +6,14 @@ using NLangDetect.Core.Extensions;
namespace NLangDetect.Core.Utils namespace NLangDetect.Core.Utils
{ {
public class NGram public class NGram
{ {
public const int GramsCount = 3; public const int GramsCount = 3;
private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE"); private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");
private static readonly string[] CjkClass = private static readonly string[] CjkClass =
{ {
#region CJK classes #region CJK classes
Messages.getString("NGram.KANJI_1_0"), Messages.getString("NGram.KANJI_1_0"),
@ -146,185 +146,185 @@ namespace NLangDetect.Core.Utils
#endregion #endregion
}; };
private static readonly Dictionary<char, char> _cjkMap; private static readonly Dictionary<char, char> _cjkMap;
private StringBuilder _grams; private StringBuilder _grams;
private bool _capitalword; private bool _capitalword;
#region Constructor(s) #region Constructor(s)
static NGram() static NGram()
{
_cjkMap = new Dictionary<char, char>();
foreach (string cjk_list in CjkClass)
{
char representative = cjk_list[0];
for (int i = 0; i < cjk_list.Length; i++)
{ {
_cjkMap.Add(cjk_list[i], representative); _cjkMap = new Dictionary<char, char>();
}
}
}
public NGram() foreach (string cjk_list in CjkClass)
{
_grams = new StringBuilder(" ");
_capitalword = false;
}
#endregion
#region Public methods
public static char Normalize(char ch)
{
UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();
if (!unicodeBlock.HasValue)
{
return ch;
}
switch (unicodeBlock.Value)
{
case UnicodeBlock.BasicLatin:
{
if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z')
{ {
return ' '; char representative = cjk_list[0];
for (int i = 0; i < cjk_list.Length; i++)
{
_cjkMap.Add(cjk_list[i], representative);
}
} }
break;
}
case UnicodeBlock.Latin1Supplement:
{
if (Latin1Excluded.IndexOf(ch) >= 0)
{
return ' ';
}
break;
}
case UnicodeBlock.GeneralPunctuation:
{
return ' ';
}
case UnicodeBlock.Arabic:
{
if (ch == '\u06cc')
{
return '\u064a';
}
break;
}
case UnicodeBlock.LatinExtendedAdditional:
{
if (ch >= '\u1ea0')
{
return '\u1ec3';
}
break;
}
case UnicodeBlock.Hiragana:
{
return '\u3042';
}
case UnicodeBlock.Katakana:
{
return '\u30a2';
}
case UnicodeBlock.Bopomofo:
case UnicodeBlock.BopomofoExtended:
{
return '\u3105';
}
case UnicodeBlock.CjkUnifiedIdeographs:
{
if (_cjkMap.ContainsKey(ch))
{
return _cjkMap[ch];
}
break;
}
case UnicodeBlock.HangulSyllables:
{
return '\uac00';
}
}
return ch;
}
public void AddChar(char ch)
{
ch = Normalize(ch);
char lastchar = _grams[_grams.Length - 1];
if (lastchar == ' ')
{
_grams = new StringBuilder(" ");
_capitalword = false;
if (ch == ' ') return;
}
else if (_grams.Length >= GramsCount)
{
_grams.Remove(0, 1);
}
_grams.Append(ch);
if (char.IsUpper(ch))
{
if (char.IsUpper(lastchar)) _capitalword = true;
}
else
{
_capitalword = false;
}
}
public string Get(int n)
{
if (_capitalword)
{
return null;
}
int len = _grams.Length;
if (n < 1 || n > 3 || len < n)
{
return null;
}
if (n == 1)
{
char ch = _grams[len - 1];
if (ch == ' ')
{
return null;
} }
return ch.ToString(); public NGram()
} {
_grams = new StringBuilder(" ");
_capitalword = false;
}
// TODO IMM HI: is ToString() here effective? #endregion
return _grams.ToString().SubSequence(len - n, len);
#region Public methods
public static char Normalize(char ch)
{
UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();
if (!unicodeBlock.HasValue)
{
return ch;
}
switch (unicodeBlock.Value)
{
case UnicodeBlock.BasicLatin:
{
if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z')
{
return ' ';
}
break;
}
case UnicodeBlock.Latin1Supplement:
{
if (Latin1Excluded.IndexOf(ch) >= 0)
{
return ' ';
}
break;
}
case UnicodeBlock.GeneralPunctuation:
{
return ' ';
}
case UnicodeBlock.Arabic:
{
if (ch == '\u06cc')
{
return '\u064a';
}
break;
}
case UnicodeBlock.LatinExtendedAdditional:
{
if (ch >= '\u1ea0')
{
return '\u1ec3';
}
break;
}
case UnicodeBlock.Hiragana:
{
return '\u3042';
}
case UnicodeBlock.Katakana:
{
return '\u30a2';
}
case UnicodeBlock.Bopomofo:
case UnicodeBlock.BopomofoExtended:
{
return '\u3105';
}
case UnicodeBlock.CjkUnifiedIdeographs:
{
if (_cjkMap.ContainsKey(ch))
{
return _cjkMap[ch];
}
break;
}
case UnicodeBlock.HangulSyllables:
{
return '\uac00';
}
}
return ch;
}
public void AddChar(char ch)
{
ch = Normalize(ch);
char lastchar = _grams[_grams.Length - 1];
if (lastchar == ' ')
{
_grams = new StringBuilder(" ");
_capitalword = false;
if (ch == ' ') return;
}
else if (_grams.Length >= GramsCount)
{
_grams.Remove(0, 1);
}
_grams.Append(ch);
if (char.IsUpper(ch))
{
if (char.IsUpper(lastchar)) _capitalword = true;
}
else
{
_capitalword = false;
}
}
public string Get(int n)
{
if (_capitalword)
{
return null;
}
int len = _grams.Length;
if (n < 1 || n > 3 || len < n)
{
return null;
}
if (n == 1)
{
char ch = _grams[len - 1];
if (ch == ' ')
{
return null;
}
return ch.ToString();
}
// TODO IMM HI: is ToString() here effective?
return _grams.ToString().SubSequence(len - n, len);
}
#endregion
} }
#endregion
}
} }

View file

@ -2,75 +2,75 @@ using System.Text;
namespace NLangDetect.Core.Utils namespace NLangDetect.Core.Utils
{ {
public class TagExtractor public class TagExtractor
{
// TODO IMM HI: do the really need to be internal?
internal string Target;
internal int Threshold;
internal StringBuilder StringBuilder;
internal string Tag;
#region Constructor(s)
public TagExtractor(string tag, int threshold)
{ {
Target = tag; // TODO IMM HI: do the really need to be internal?
Threshold = threshold; internal string Target;
Count = 0; internal int Threshold;
Clear(); internal StringBuilder StringBuilder;
} internal string Tag;
#endregion #region Constructor(s)
#region Public methods public TagExtractor(string tag, int threshold)
public void Clear()
{
StringBuilder = new StringBuilder();
Tag = null;
}
public void SetTag(string tag)
{
Tag = tag;
}
public void Add(string line)
{
if (Tag == Target && line != null)
{
StringBuilder.Append(line);
}
}
public void CloseTag(LangProfile profile)
{
if (profile != null && Tag == Target && StringBuilder.Length > Threshold)
{
var gram = new NGram();
for (int i = 0; i < StringBuilder.Length; i++)
{ {
gram.AddChar(StringBuilder[i]); Target = tag;
Threshold = threshold;
for (int n = 1; n <= NGram.GramsCount; n++) Count = 0;
{ Clear();
profile.Add(gram.Get(n));
}
} }
Count++; #endregion
}
Clear(); #region Public methods
public void Clear()
{
StringBuilder = new StringBuilder();
Tag = null;
}
public void SetTag(string tag)
{
Tag = tag;
}
public void Add(string line)
{
if (Tag == Target && line != null)
{
StringBuilder.Append(line);
}
}
public void CloseTag(LangProfile profile)
{
if (profile != null && Tag == Target && StringBuilder.Length > Threshold)
{
var gram = new NGram();
for (int i = 0; i < StringBuilder.Length; i++)
{
gram.AddChar(StringBuilder[i]);
for (int n = 1; n <= NGram.GramsCount; n++)
{
profile.Add(gram.Get(n));
}
}
Count++;
}
Clear();
}
#endregion
#region Properties
public int Count { get; private set; }
#endregion
} }
#endregion
#region Properties
public int Count { get; private set; }
#endregion
}
} }

View file

@ -1,9 +1,9 @@
using System; using System;
using System.Text; using System.Text;
using MediaBrowser.Model.IO; using MediaBrowser.Model.IO;
using Microsoft.Extensions.Logging;
using MediaBrowser.Model.Serialization; using MediaBrowser.Model.Serialization;
using MediaBrowser.Model.Text; using MediaBrowser.Model.Text;
using Microsoft.Extensions.Logging;
using NLangDetect.Core; using NLangDetect.Core;
using UniversalDetector; using UniversalDetector;

View file

@ -100,7 +100,7 @@ namespace UniversalDetector
this.confidence = 0.0f; this.confidence = 0.0f;
base.Reset(); base.Reset();
} }
public string Charset => charset; public string Charset => charset;
public float Confidence => confidence; public float Confidence => confidence;
@ -109,9 +109,9 @@ namespace UniversalDetector
{ {
this.charset = charset; this.charset = charset;
this.confidence = confidence; this.confidence = confidence;
// if (Finished != null) { // if (Finished != null) {
// Finished(charset, confidence); // Finished(charset, confidence);
// } // }
} }
} }

View file

@ -57,27 +57,34 @@ namespace UniversalDetector.Core
int codingState = 0; int codingState = 0;
int max = offset + len; int max = offset + len;
for (int i = offset; i < max; i++) { for (int i = offset; i < max; i++)
{
codingState = codingSM.NextState(buf[i]); codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) { if (codingState == SMModel.ERROR)
{
state = ProbingState.NotMe; state = ProbingState.NotMe;
break; break;
} }
if (codingState == SMModel.ITSME) { if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt; state = ProbingState.FoundIt;
break; break;
} }
if (codingState == SMModel.START) { if (codingState == SMModel.START)
{
int charLen = codingSM.CurrentCharLen; int charLen = codingSM.CurrentCharLen;
if (i == offset) { if (i == offset)
{
lastChar[1] = buf[offset]; lastChar[1] = buf[offset];
distributionAnalyser.HandleOneChar(lastChar, 0, charLen); distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
} else { }
distributionAnalyser.HandleOneChar(buf, i-1, charLen); else
{
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
} }
} }
} }
lastChar[0] = buf[max-1]; lastChar[0] = buf[max - 1];
if (state == ProbingState.Detecting) if (state == ProbingState.Detecting)
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)

View file

@ -40,20 +40,20 @@ namespace UniversalDetector.Core
{ {
public class BitPackage public class BitPackage
{ {
public static int INDEX_SHIFT_4BITS = 3; public static int INDEX_SHIFT_4BITS = 3;
public static int INDEX_SHIFT_8BITS = 2; public static int INDEX_SHIFT_8BITS = 2;
public static int INDEX_SHIFT_16BITS = 1; public static int INDEX_SHIFT_16BITS = 1;
public static int SHIFT_MASK_4BITS = 7; public static int SHIFT_MASK_4BITS = 7;
public static int SHIFT_MASK_8BITS = 3; public static int SHIFT_MASK_8BITS = 3;
public static int SHIFT_MASK_16BITS = 1; public static int SHIFT_MASK_16BITS = 1;
public static int BIT_SHIFT_4BITS = 2; public static int BIT_SHIFT_4BITS = 2;
public static int BIT_SHIFT_8BITS = 3; public static int BIT_SHIFT_8BITS = 3;
public static int BIT_SHIFT_16BITS = 4; public static int BIT_SHIFT_16BITS = 4;
public static int UNIT_MASK_4BITS = 0x0000000F; public static int UNIT_MASK_4BITS = 0x0000000F;
public static int UNIT_MASK_8BITS = 0x000000FF; public static int UNIT_MASK_8BITS = 0x000000FF;
public static int UNIT_MASK_16BITS = 0x0000FFFF; public static int UNIT_MASK_16BITS = 0x0000FFFF;
private int indexShift; private int indexShift;
@ -94,5 +94,5 @@ namespace UniversalDetector.Core
return (data[i >> indexShift] >> return (data[i >> indexShift] >>
((i & shiftMask) << bitShift)) & unitMask; ((i & shiftMask) << bitShift)) & unitMask;
} }
} }
} }

View file

@ -97,9 +97,11 @@ namespace UniversalDetector.Core
{ {
//we only care about 2-bytes character in our distribution analysis //we only care about 2-bytes character in our distribution analysis
int order = (charLen == 2) ? GetOrder(buf, offset) : -1; int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
if (order >= 0) { if (order >= 0)
{
totalChars++; totalChars++;
if (order < tableSize) { // order is valid if (order < tableSize)
{ // order is valid
if (512 > charToFreqOrder[order]) if (512 > charToFreqOrder[order])
freqChars++; freqChars++;
} }
@ -124,7 +126,8 @@ namespace UniversalDetector.Core
// negative answer // negative answer
if (totalChars <= 0 || freqChars <= MINIMUM_DATA_THRESHOLD) if (totalChars <= 0 || freqChars <= MINIMUM_DATA_THRESHOLD)
return SURE_NO; return SURE_NO;
if (totalChars != freqChars) { if (totalChars != freqChars)
{
float r = freqChars / ((totalChars - freqChars) * typicalDistributionRatio); float r = freqChars / ((totalChars - freqChars) * typicalDistributionRatio);
if (r < SURE_YES) if (r < SURE_YES)
return r; return r;
@ -610,8 +613,8 @@ namespace UniversalDetector.Core
/// <returns></returns> /// <returns></returns>
public override int GetOrder(byte[] buf, int offset) public override int GetOrder(byte[] buf, int offset)
{ {
if (buf[offset] >= 0xB0 && buf[offset+1] >= 0xA1) if (buf[offset] >= 0xB0 && buf[offset + 1] >= 0xA1)
return 94 * (buf[offset] - 0xb0) + buf[offset+1] - 0xA1; return 94 * (buf[offset] - 0xb0) + buf[offset + 1] - 0xA1;
else else
return -1; return -1;
} }
@ -1040,7 +1043,7 @@ namespace UniversalDetector.Core
public override int GetOrder(byte[] buf, int offset) public override int GetOrder(byte[] buf, int offset)
{ {
if (buf[offset] >= 0xC4) if (buf[offset] >= 0xC4)
return 94 * (buf[offset] - 0xC4) + buf[offset+1] - 0xA1; return 94 * (buf[offset] - 0xC4) + buf[offset + 1] - 0xA1;
else else
return -1; return -1;
} }
@ -1048,7 +1051,7 @@ namespace UniversalDetector.Core
public class EUCKRDistributionAnalyser : CharDistributionAnalyser public class EUCKRDistributionAnalyser : CharDistributionAnalyser
{ {
// Sampling from about 20M text materials include literature and computer technology // Sampling from about 20M text materials include literature and computer technology
/* /*
* 128 --> 0.79 * 128 --> 0.79
* 256 --> 0.92 * 256 --> 0.92
@ -1634,7 +1637,7 @@ namespace UniversalDetector.Core
public override int GetOrder(byte[] buf, int offset) public override int GetOrder(byte[] buf, int offset)
{ {
if (buf[offset] >= 0xB0) if (buf[offset] >= 0xB0)
return 94 * (buf[offset] - 0xB0) + buf[offset+1] - 0xA1; return 94 * (buf[offset] - 0xB0) + buf[offset + 1] - 0xA1;
else else
return -1; return -1;
} }
@ -2559,12 +2562,15 @@ namespace UniversalDetector.Core
/// </summary> /// </summary>
public override int GetOrder(byte[] buf, int offset) public override int GetOrder(byte[] buf, int offset)
{ {
if (buf[offset] >= 0xA4) { if (buf[offset] >= 0xA4)
if (buf[offset+1] >= 0xA1) {
return 157 * (buf[offset] - 0xA4) + buf[offset+1] - 0xA1 + 63; if (buf[offset + 1] >= 0xA1)
return 157 * (buf[offset] - 0xA4) + buf[offset + 1] - 0xA1 + 63;
else else
return 157 * (buf[offset] - 0xA4) + buf[offset+1] - 0x40; return 157 * (buf[offset] - 0xA4) + buf[offset + 1] - 0x40;
} else { }
else
{
return -1; return -1;
} }
} }
@ -3140,9 +3146,9 @@ namespace UniversalDetector.Core
order = 188 * (buf[offset] - 0xE0 + 31); order = 188 * (buf[offset] - 0xE0 + 31);
else else
return -1; return -1;
order += buf[offset+1] - 0x40; order += buf[offset + 1] - 0x40;
if (buf[offset+1] > 0x7F) if (buf[offset + 1] > 0x7F)
order--; order--;
return order; return order;
} }
@ -3162,7 +3168,7 @@ namespace UniversalDetector.Core
public override int GetOrder(byte[] buf, int offset) public override int GetOrder(byte[] buf, int offset)
{ {
if (buf[offset] >= 0xA0) if (buf[offset] >= 0xA0)
return 94 * (buf[offset] - 0xA1) + buf[offset+1] - 0xA1; return 94 * (buf[offset] - 0xA1) + buf[offset + 1] - 0xA1;
else else
return -1; return -1;
} }

View file

@ -40,7 +40,8 @@ using System.IO;
namespace UniversalDetector.Core namespace UniversalDetector.Core
{ {
public enum ProbingState { public enum ProbingState
{
Detecting = 0, // no sure answer yet, but caller can ask for confidence Detecting = 0, // no sure answer yet, but caller can ask for confidence
FoundIt = 1, // positive answer FoundIt = 1, // positive answer
NotMe = 2 // negative answer NotMe = 2 // negative answer
@ -107,21 +108,27 @@ namespace UniversalDetector.Core
{ {
byte[] result = null; byte[] result = null;
using (MemoryStream ms = new MemoryStream(buf.Length)) { using (MemoryStream ms = new MemoryStream(buf.Length))
{
bool meetMSB = false; bool meetMSB = false;
int max = offset + len; int max = offset + len;
int prev = offset; int prev = offset;
int cur = offset; int cur = offset;
while (cur < max) { while (cur < max)
{
byte b = buf[cur]; byte b = buf[cur];
if ((b & 0x80) != 0) { if ((b & 0x80) != 0)
{
meetMSB = true; meetMSB = true;
} else if (b < CAPITAL_A || (b > CAPITAL_Z && b < SMALL_A) }
|| b > SMALL_Z) { else if (b < CAPITAL_A || (b > CAPITAL_Z && b < SMALL_A)
if (meetMSB && cur > prev) { || b > SMALL_Z)
{
if (meetMSB && cur > prev)
{
ms.Write(buf, prev, cur - prev); ms.Write(buf, prev, cur - prev);
ms.WriteByte(SPACE); ms.WriteByte(SPACE);
meetMSB = false; meetMSB = false;
@ -149,14 +156,16 @@ namespace UniversalDetector.Core
{ {
byte[] result = null; byte[] result = null;
using (MemoryStream ms = new MemoryStream(buf.Length)) { using (MemoryStream ms = new MemoryStream(buf.Length))
{
bool inTag = false; bool inTag = false;
int max = offset + len; int max = offset + len;
int prev = offset; int prev = offset;
int cur = offset; int cur = offset;
while (cur < max) { while (cur < max)
{
byte b = buf[cur]; byte b = buf[cur];
@ -167,8 +176,10 @@ namespace UniversalDetector.Core
// it's ascii, but it's not a letter // it's ascii, but it's not a letter
if ((b & 0x80) == 0 && (b < CAPITAL_A || b > SMALL_Z if ((b & 0x80) == 0 && (b < CAPITAL_A || b > SMALL_Z
|| (b > CAPITAL_Z && b < SMALL_A))) { || (b > CAPITAL_Z && b < SMALL_A)))
if (cur > prev && !inTag) { {
if (cur > prev && !inTag)
{
ms.Write(buf, prev, cur - prev); ms.Write(buf, prev, cur - prev);
ms.WriteByte(SPACE); ms.WriteByte(SPACE);
} }

View file

@ -60,7 +60,8 @@ namespace UniversalDetector.Core
// for each byte we get its class, if it is first byte, // for each byte we get its class, if it is first byte,
// we also get byte length // we also get byte length
int byteCls = model.GetClass(b); int byteCls = model.GetClass(b);
if (currentState == SMModel.START) { if (currentState == SMModel.START)
{
currentBytePos = 0; currentBytePos = 0;
currentCharLen = model.charLenTable[byteCls]; currentCharLen = model.charLenTable[byteCls];
} }

View file

@ -62,29 +62,36 @@ namespace UniversalDetector.Core
int codingState; int codingState;
int max = offset + len; int max = offset + len;
for (int i = offset; i < max; i++) { for (int i = offset; i < max; i++)
{
codingState = codingSM.NextState(buf[i]); codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) { if (codingState == SMModel.ERROR)
{
state = ProbingState.NotMe; state = ProbingState.NotMe;
break; break;
} }
if (codingState == SMModel.ITSME) { if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt; state = ProbingState.FoundIt;
break; break;
} }
if (codingState == SMModel.START) { if (codingState == SMModel.START)
{
int charLen = codingSM.CurrentCharLen; int charLen = codingSM.CurrentCharLen;
if (i == offset) { if (i == offset)
{
lastChar[1] = buf[offset]; lastChar[1] = buf[offset];
contextAnalyser.HandleOneChar(lastChar, 0, charLen); contextAnalyser.HandleOneChar(lastChar, 0, charLen);
distributionAnalyser.HandleOneChar(lastChar, 0, charLen); distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
} else { }
contextAnalyser.HandleOneChar(buf, i-1, charLen); else
distributionAnalyser.HandleOneChar(buf, i-1, charLen); {
contextAnalyser.HandleOneChar(buf, i - 1, charLen);
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
} }
} }
} }
lastChar[0] = buf[max-1]; lastChar[0] = buf[max - 1];
if (state == ProbingState.Detecting) if (state == ProbingState.Detecting)
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt; state = ProbingState.FoundIt;

View file

@ -60,27 +60,34 @@ namespace UniversalDetector.Core
int codingState; int codingState;
int max = offset + len; int max = offset + len;
for (int i = offset; i < max; i++) { for (int i = offset; i < max; i++)
{
codingState = codingSM.NextState(buf[i]); codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) { if (codingState == SMModel.ERROR)
{
state = ProbingState.NotMe; state = ProbingState.NotMe;
break; break;
} }
if (codingState == SMModel.ITSME) { if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt; state = ProbingState.FoundIt;
break; break;
} }
if (codingState == SMModel.START) { if (codingState == SMModel.START)
{
int charLen = codingSM.CurrentCharLen; int charLen = codingSM.CurrentCharLen;
if (i == offset) { if (i == offset)
{
lastChar[1] = buf[offset]; lastChar[1] = buf[offset];
distributionAnalyser.HandleOneChar(lastChar, 0, charLen); distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
} else { }
distributionAnalyser.HandleOneChar(buf, i-1, charLen); else
{
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
} }
} }
} }
lastChar[0] = buf[max-1]; lastChar[0] = buf[max - 1];
if (state == ProbingState.Detecting) if (state == ProbingState.Detecting)
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)

View file

@ -56,27 +56,34 @@ namespace UniversalDetector.Core
int codingState; int codingState;
int max = offset + len; int max = offset + len;
for (int i = 0; i < max; i++) { for (int i = 0; i < max; i++)
{
codingState = codingSM.NextState(buf[i]); codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) { if (codingState == SMModel.ERROR)
{
state = ProbingState.NotMe; state = ProbingState.NotMe;
break; break;
} }
if (codingState == SMModel.ITSME) { if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt; state = ProbingState.FoundIt;
break; break;
} }
if (codingState == SMModel.START) { if (codingState == SMModel.START)
{
int charLen = codingSM.CurrentCharLen; int charLen = codingSM.CurrentCharLen;
if (i == offset) { if (i == offset)
{
lastChar[1] = buf[offset]; lastChar[1] = buf[offset];
distributionAnalyser.HandleOneChar(lastChar, 0, charLen); distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
} else { }
distributionAnalyser.HandleOneChar(buf, i-1, charLen); else
{
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
} }
} }
} }
lastChar[0] = buf[max-1]; lastChar[0] = buf[max - 1];
if (state == ProbingState.Detecting) if (state == ProbingState.Detecting)
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)

View file

@ -67,22 +67,30 @@ namespace UniversalDetector.Core
{ {
int max = offset + len; int max = offset + len;
for (int i = offset; i < max && state == ProbingState.Detecting; i++) { for (int i = offset; i < max && state == ProbingState.Detecting; i++)
for (int j = activeSM - 1; j >= 0; j--) { {
for (int j = activeSM - 1; j >= 0; j--)
{
// byte is feed to all active state machine // byte is feed to all active state machine
int codingState = codingSM[j].NextState(buf[i]); int codingState = codingSM[j].NextState(buf[i]);
if (codingState == SMModel.ERROR) { if (codingState == SMModel.ERROR)
{
// got negative answer for this state machine, make it inactive // got negative answer for this state machine, make it inactive
activeSM--; activeSM--;
if (activeSM == 0) { if (activeSM == 0)
{
state = ProbingState.NotMe; state = ProbingState.NotMe;
return state; return state;
} else if (j != activeSM) { }
else if (j != activeSM)
{
CodingStateMachine t = codingSM[activeSM]; CodingStateMachine t = codingSM[activeSM];
codingSM[activeSM] = codingSM[j]; codingSM[activeSM] = codingSM[j];
codingSM[j] = t; codingSM[j] = t;
} }
} else if (codingState == SMModel.ITSME) { }
else if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt; state = ProbingState.FoundIt;
detectedCharset = codingSM[j].ModelName; detectedCharset = codingSM[j].ModelName;
return state; return state;

View file

@ -87,7 +87,7 @@ namespace UniversalDetector.Core
BitPackage.Pack4bits( 4, ITSME, START, START, START, START, START, START) //28-2f BitPackage.Pack4bits( 4, ITSME, START, START, START, START, START, START) //28-2f
}; };
private readonly static int[] HZCharLenTable = {0, 0, 0, 0, 0, 0}; private readonly static int[] HZCharLenTable = { 0, 0, 0, 0, 0, 0 };
public HZSMModel() : base( public HZSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS, new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
@ -153,7 +153,7 @@ namespace UniversalDetector.Core
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START) //38-3f BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START) //38-3f
}; };
private readonly static int[] ISO2022CNCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0}; private readonly static int[] ISO2022CNCharLenTable = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
public ISO2022CNSMModel() : base( public ISO2022CNSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS, new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
@ -220,7 +220,7 @@ namespace UniversalDetector.Core
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ERROR,START,START) //40-47 BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ERROR,START,START) //40-47
}; };
private readonly static int[] ISO2022JPCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; private readonly static int[] ISO2022JPCharLenTable = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
public ISO2022JPSMModel() : base( public ISO2022JPSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS, new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
@ -284,7 +284,7 @@ namespace UniversalDetector.Core
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,START,START,START,START) //20-27 BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,START,START,START,START) //20-27
}; };
private readonly static int[] ISO2022KRCharLenTable = {0, 0, 0, 0, 0, 0}; private readonly static int[] ISO2022KRCharLenTable = { 0, 0, 0, 0, 0, 0 };
public ISO2022KRSMModel() : base( public ISO2022KRSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS, new BitPackage(BitPackage.INDEX_SHIFT_4BITS,

View file

@ -64,30 +64,38 @@ namespace UniversalDetector.Core
int codingState = SMModel.START; int codingState = SMModel.START;
int max = offset + len; int max = offset + len;
for (int i = offset; i < max; i++) { for (int i = offset; i < max; i++)
{
codingState = codingSM.NextState(buf[i]); codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) { if (codingState == SMModel.ERROR)
{
state = ProbingState.NotMe; state = ProbingState.NotMe;
break; break;
} }
if (codingState == SMModel.ITSME) { if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt; state = ProbingState.FoundIt;
break; break;
} }
if (codingState == SMModel.START) { if (codingState == SMModel.START)
{
int charLen = codingSM.CurrentCharLen; int charLen = codingSM.CurrentCharLen;
if (i == offset) { if (i == offset)
{
lastChar[1] = buf[offset]; lastChar[1] = buf[offset];
analyser.HandleOneChar(lastChar, 0, charLen); analyser.HandleOneChar(lastChar, 0, charLen);
} else { }
analyser.HandleOneChar(buf, i-1, charLen); else
{
analyser.HandleOneChar(buf, i - 1, charLen);
} }
} }
} }
lastChar[0] = buf[max-1]; lastChar[0] = buf[max - 1];
if (state == ProbingState.Detecting) { if (state == ProbingState.Detecting)
{
if (analyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) if (analyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt; state = ProbingState.FoundIt;
} }

View file

@ -36,7 +36,6 @@
* *
* ***** END LICENSE BLOCK ***** */ * ***** END LICENSE BLOCK ***** */
using System;
/** /**
* General ideas of the Hebrew charset recognition * General ideas of the Hebrew charset recognition
@ -144,11 +143,11 @@ namespace UniversalDetector.Core
public class HebrewProber : CharsetProber public class HebrewProber : CharsetProber
{ {
// windows-1255 / ISO-8859-8 code points of interest // windows-1255 / ISO-8859-8 code points of interest
private const byte FINAL_KAF = 0xEA; private const byte FINAL_KAF = 0xEA;
private const byte NORMAL_KAF = 0xEB; private const byte NORMAL_KAF = 0xEB;
private const byte FINAL_MEM = 0xED; private const byte FINAL_MEM = 0xED;
private const byte NORMAL_MEM = 0xEE; private const byte NORMAL_MEM = 0xEE;
private const byte FINAL_NUN = 0xEF; private const byte FINAL_NUN = 0xEF;
private const byte NORMAL_NUN = 0xF0; private const byte NORMAL_NUN = 0xF0;
private const byte FINAL_PE = 0xF3; private const byte FINAL_PE = 0xF3;
private const byte NORMAL_PE = 0xF4; private const byte NORMAL_PE = 0xF4;
@ -217,14 +216,17 @@ namespace UniversalDetector.Core
int max = offset + len; int max = offset + len;
for (int i = offset; i < max; i++) { for (int i = offset; i < max; i++)
{
byte b = buf[i]; byte b = buf[i];
// a word just ended // a word just ended
if (b == 0x20) { if (b == 0x20)
{
// *(curPtr-2) was not a space so prev is not a 1 letter word // *(curPtr-2) was not a space so prev is not a 1 letter word
if (beforePrev != 0x20) { if (beforePrev != 0x20)
{
// case (1) [-2:not space][-1:final letter][cur:space] // case (1) [-2:not space][-1:final letter][cur:space]
if (IsFinal(prev)) if (IsFinal(prev))
finalCharLogicalScore++; finalCharLogicalScore++;
@ -233,7 +235,9 @@ namespace UniversalDetector.Core
finalCharVisualScore++; finalCharVisualScore++;
} }
} else { }
else
{
// case (3) [-2:space][-1:final letter][cur:not space] // case (3) [-2:space][-1:final letter][cur:not space]
if ((beforePrev == 0x20) && (IsFinal(prev)) && (b != ' ')) if ((beforePrev == 0x20) && (IsFinal(prev)) && (b != ' '))
++finalCharVisualScore; ++finalCharVisualScore;

View file

@ -160,7 +160,7 @@ namespace UniversalDetector.Core
{ {
// This is just one way to calculate confidence. It works well for me. // This is just one way to calculate confidence. It works well for me.
if (totalRel > MINIMUM_DATA_THRESHOLD) if (totalRel > MINIMUM_DATA_THRESHOLD)
return ((float)(totalRel - relSample[0]))/totalRel; return ((float)(totalRel - relSample[0])) / totalRel;
else else
return DONT_KNOW; return DONT_KNOW;
} }
@ -181,22 +181,28 @@ namespace UniversalDetector.Core
// to record those bytes as well and analyse the character once it // to record those bytes as well and analyse the character once it
// is complete, but since a character will not make much difference, // is complete, but since a character will not make much difference,
// skipping it will simplify our logic and improve performance. // skipping it will simplify our logic and improve performance.
for (int i = needToSkipCharNum+offset; i < max; ) { for (int i = needToSkipCharNum + offset; i < max;)
{
int order = GetOrder(buf, i, out charLen); int order = GetOrder(buf, i, out charLen);
i += charLen; i += charLen;
if (i > max) { if (i > max)
{
needToSkipCharNum = i - max; needToSkipCharNum = i - max;
lastCharOrder = -1; lastCharOrder = -1;
} else { }
if (order != -1 && lastCharOrder != -1) { else
totalRel ++; {
if (totalRel > MAX_REL_THRESHOLD) { if (order != -1 && lastCharOrder != -1)
{
totalRel++;
if (totalRel > MAX_REL_THRESHOLD)
{
done = true; done = true;
break; break;
} }
relSample[jp2CharContext[lastCharOrder, order]]++; relSample[jp2CharContext[lastCharOrder, order]]++;
} }
lastCharOrder = order; lastCharOrder = order;
} }
} }
} }
@ -210,7 +216,8 @@ namespace UniversalDetector.Core
// Only 2-bytes characters are of our interest // Only 2-bytes characters are of our interest
int order = (charLen == 2) ? GetOrder(buf, offset) : -1; int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
if (order != -1 && lastCharOrder != -1) { if (order != -1 && lastCharOrder != -1)
{
totalRel++; totalRel++;
// count this sequence to its category counter // count this sequence to its category counter
relSample[jp2CharContext[lastCharOrder, order]]++; relSample[jp2CharContext[lastCharOrder, order]]++;
@ -221,7 +228,8 @@ namespace UniversalDetector.Core
public void Reset() public void Reset()
{ {
totalRel = 0; totalRel = 0;
for (int i = 0; i < CATEGORIES_NUM; i++) { for (int i = 0; i < CATEGORIES_NUM; i++)
{
relSample[i] = 0; relSample[i] = 0;
needToSkipCharNum = 0; needToSkipCharNum = 0;
lastCharOrder = -1; lastCharOrder = -1;
@ -254,8 +262,9 @@ namespace UniversalDetector.Core
charLen = 1; charLen = 1;
// return its order if it is hiragana // return its order if it is hiragana
if (buf[offset] == HIRAGANA_FIRST_BYTE) { if (buf[offset] == HIRAGANA_FIRST_BYTE)
byte low = buf[offset+1]; {
byte low = buf[offset + 1];
if (low >= 0x9F && low <= 0xF1) if (low >= 0x9F && low <= 0xF1)
return low - 0x9F; return low - 0x9F;
} }
@ -265,8 +274,9 @@ namespace UniversalDetector.Core
protected override int GetOrder(byte[] buf, int offset) protected override int GetOrder(byte[] buf, int offset)
{ {
// We are only interested in Hiragana // We are only interested in Hiragana
if (buf[offset] == HIRAGANA_FIRST_BYTE) { if (buf[offset] == HIRAGANA_FIRST_BYTE)
byte low = buf[offset+1]; {
byte low = buf[offset + 1];
if (low >= 0x9F && low <= 0xF1) if (low >= 0x9F && low <= 0xF1)
return low - 0x9F; return low - 0x9F;
} }
@ -292,8 +302,9 @@ namespace UniversalDetector.Core
charLen = 1; charLen = 1;
// return its order if it is hiragana // return its order if it is hiragana
if (high == HIRAGANA_FIRST_BYTE) { if (high == HIRAGANA_FIRST_BYTE)
byte low = buf[offset+1]; {
byte low = buf[offset + 1];
if (low >= 0xA1 && low <= 0xF3) if (low >= 0xA1 && low <= 0xF3)
return low - 0xA1; return low - 0xA1;
} }
@ -303,8 +314,9 @@ namespace UniversalDetector.Core
protected override int GetOrder(byte[] buf, int offset) protected override int GetOrder(byte[] buf, int offset)
{ {
// We are only interested in Hiragana // We are only interested in Hiragana
if (buf[offset] == HIRAGANA_FIRST_BYTE) { if (buf[offset] == HIRAGANA_FIRST_BYTE)
byte low = buf[offset+1]; {
byte low = buf[offset + 1];
if (low >= 0xA1 && low <= 0xF3) if (low >= 0xA1 && low <= 0xF3)
return low - 0xA1; return low - 0xA1;
} }

View file

@ -36,7 +36,6 @@
* *
* ***** END LICENSE BLOCK ***** */ * ***** END LICENSE BLOCK ***** */
using System;
namespace UniversalDetector.Core namespace UniversalDetector.Core
{ {
@ -135,12 +134,14 @@ namespace UniversalDetector.Core
byte[] newbuf = FilterWithEnglishLetters(buf, offset, len); byte[] newbuf = FilterWithEnglishLetters(buf, offset, len);
byte charClass, freq; byte charClass, freq;
for (int i = 0; i < newbuf.Length; i++) { for (int i = 0; i < newbuf.Length; i++)
{
charClass = Latin1_CharToClass[newbuf[i]]; charClass = Latin1_CharToClass[newbuf[i]];
freq = Latin1ClassModel[lastCharClass * CLASS_NUM + charClass]; freq = Latin1ClassModel[lastCharClass * CLASS_NUM + charClass];
if (freq == 0) { if (freq == 0)
state = ProbingState.NotMe; {
break; state = ProbingState.NotMe;
break;
} }
freqCounter[freq]++; freqCounter[freq]++;
lastCharClass = charClass; lastCharClass = charClass;
@ -155,13 +156,17 @@ namespace UniversalDetector.Core
float confidence = 0.0f; float confidence = 0.0f;
int total = 0; int total = 0;
for (int i = 0; i < FREQ_CAT_NUM; i++) { for (int i = 0; i < FREQ_CAT_NUM; i++)
{
total += freqCounter[i]; total += freqCounter[i];
} }
if (total <= 0) { if (total <= 0)
{
confidence = 0.0f; confidence = 0.0f;
} else { }
else
{
confidence = freqCounter[3] * 1.0f / total; confidence = freqCounter[3] * 1.0f / total;
confidence -= freqCounter[1] * 20.0f / total; confidence -= freqCounter[1] * 20.0f / total;
} }

View file

@ -36,7 +36,6 @@
* *
* ***** END LICENSE BLOCK ***** */ * ***** END LICENSE BLOCK ***** */
using System;
namespace UniversalDetector.Core namespace UniversalDetector.Core
{ {
@ -67,7 +66,8 @@ namespace UniversalDetector.Core
public override string GetCharsetName() public override string GetCharsetName()
{ {
if (bestGuess == -1) { if (bestGuess == -1)
{
GetConfidence(); GetConfidence();
if (bestGuess == -1) if (bestGuess == -1)
bestGuess = 0; bestGuess = 0;
@ -78,13 +78,17 @@ namespace UniversalDetector.Core
public override void Reset() public override void Reset()
{ {
activeNum = 0; activeNum = 0;
for (int i = 0; i < probers.Length; i++) { for (int i = 0; i < probers.Length; i++)
if (probers[i] != null) { {
probers[i].Reset(); if (probers[i] != null)
isActive[i] = true; {
++activeNum; probers[i].Reset();
} else { isActive[i] = true;
isActive[i] = false; ++activeNum;
}
else
{
isActive[i] = false;
} }
} }
bestGuess = -1; bestGuess = -1;
@ -100,13 +104,18 @@ namespace UniversalDetector.Core
bool keepNext = true; bool keepNext = true;
int max = offset + len; int max = offset + len;
for (int i = offset; i < max; i++) { for (int i = offset; i < max; i++)
if ((buf[i] & 0x80) != 0) { {
if ((buf[i] & 0x80) != 0)
{
highbyteBuf[hptr++] = buf[i]; highbyteBuf[hptr++] = buf[i];
keepNext = true; keepNext = true;
} else { }
else
{
//if previous is highbyte, keep this even it is a ASCII //if previous is highbyte, keep this even it is a ASCII
if (keepNext) { if (keepNext)
{
highbyteBuf[hptr++] = buf[i]; highbyteBuf[hptr++] = buf[i];
keepNext = false; keepNext = false;
} }
@ -115,18 +124,23 @@ namespace UniversalDetector.Core
ProbingState st = ProbingState.NotMe; ProbingState st = ProbingState.NotMe;
for (int i = 0; i < probers.Length; i++) { for (int i = 0; i < probers.Length; i++)
{
if (!isActive[i]) if (!isActive[i])
continue; continue;
st = probers[i].HandleData(highbyteBuf, 0, hptr); st = probers[i].HandleData(highbyteBuf, 0, hptr);
if (st == ProbingState.FoundIt) { if (st == ProbingState.FoundIt)
{
bestGuess = i; bestGuess = i;
state = ProbingState.FoundIt; state = ProbingState.FoundIt;
break; break;
} else if (st == ProbingState.NotMe) { }
else if (st == ProbingState.NotMe)
{
isActive[i] = false; isActive[i] = false;
activeNum--; activeNum--;
if (activeNum <= 0) { if (activeNum <= 0)
{
state = ProbingState.NotMe; state = ProbingState.NotMe;
break; break;
} }
@ -140,16 +154,23 @@ namespace UniversalDetector.Core
float bestConf = 0.0f; float bestConf = 0.0f;
float cf = 0.0f; float cf = 0.0f;
if (state == ProbingState.FoundIt) { if (state == ProbingState.FoundIt)
{
return 0.99f; return 0.99f;
} else if (state == ProbingState.NotMe) { }
else if (state == ProbingState.NotMe)
{
return 0.01f; return 0.01f;
} else { }
for (int i = 0; i < PROBERS_NUM; i++) { else
{
for (int i = 0; i < PROBERS_NUM; i++)
{
if (!isActive[i]) if (!isActive[i])
continue; continue;
cf = probers[i].GetConfidence(); cf = probers[i].GetConfidence();
if (bestConf < cf) { if (bestConf < cf)
{
bestConf = cf; bestConf = cf;
bestGuess = i; bestGuess = i;
} }
@ -162,10 +183,14 @@ namespace UniversalDetector.Core
{ {
float cf; float cf;
GetConfidence(); GetConfidence();
for (int i = 0; i < PROBERS_NUM; i++) { for (int i = 0; i < PROBERS_NUM; i++)
if (!isActive[i]) { {
if (!isActive[i])
{
//Console.WriteLine(" MBCS inactive: {0} (confidence is too low).", ProberName[i]); //Console.WriteLine(" MBCS inactive: {0} (confidence is too low).", ProberName[i]);
} else { }
else
{
cf = probers[i].GetConfidence(); cf = probers[i].GetConfidence();
//Console.WriteLine(" MBCS {0}: [{1}]", cf, ProberName[i]); //Console.WriteLine(" MBCS {0}: [{1}]", cf, ProberName[i]);
} }

View file

@ -174,7 +174,7 @@ namespace UniversalDetector.Core
// it is used for frequency analysis only, and we are validating // it is used for frequency analysis only, and we are validating
// each code range there as well. So it is safe to set it to be // each code range there as well. So it is safe to set it to be
// 2 here. // 2 here.
private readonly static int[] GB18030CharLenTable = {0, 1, 1, 1, 1, 1, 2}; private readonly static int[] GB18030CharLenTable = { 0, 1, 1, 1, 1, 1, 2 };
public GB18030SMModel() : base( public GB18030SMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS, new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
@ -235,7 +235,7 @@ namespace UniversalDetector.Core
BitPackage.Pack4bits(ERROR,START,START,START,START,START,START,START) //10-17 BitPackage.Pack4bits(ERROR,START,START,START,START,START,START,START) //10-17
}; };
private readonly static int[] BIG5CharLenTable = {0, 1, 1, 2, 0}; private readonly static int[] BIG5CharLenTable = { 0, 1, 1, 2, 0 };
public BIG5SMModel() : base( public BIG5SMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS, new BitPackage(BitPackage.INDEX_SHIFT_4BITS,

View file

@ -36,7 +36,6 @@
* *
* ***** END LICENSE BLOCK ***** */ * ***** END LICENSE BLOCK ***** */
using System;
namespace UniversalDetector.Core namespace UniversalDetector.Core
{ {
@ -88,19 +87,24 @@ namespace UniversalDetector.Core
if (newBuf.Length == 0) if (newBuf.Length == 0)
return state; // Nothing to see here, move on. return state; // Nothing to see here, move on.
for (int i = 0; i < PROBERS_NUM; i++) { for (int i = 0; i < PROBERS_NUM; i++)
{
if (!isActive[i]) if (!isActive[i])
continue; continue;
st = probers[i].HandleData(newBuf, 0, newBuf.Length); st = probers[i].HandleData(newBuf, 0, newBuf.Length);
if (st == ProbingState.FoundIt) { if (st == ProbingState.FoundIt)
{
bestGuess = i; bestGuess = i;
state = ProbingState.FoundIt; state = ProbingState.FoundIt;
break; break;
} else if (st == ProbingState.NotMe) { }
else if (st == ProbingState.NotMe)
{
isActive[i] = false; isActive[i] = false;
activeNum--; activeNum--;
if (activeNum <= 0) { if (activeNum <= 0)
{
state = ProbingState.NotMe; state = ProbingState.NotMe;
break; break;
} }
@ -112,24 +116,25 @@ namespace UniversalDetector.Core
public override float GetConfidence() public override float GetConfidence()
{ {
float bestConf = 0.0f, cf; float bestConf = 0.0f, cf;
switch (state) { switch (state)
case ProbingState.FoundIt: {
return 0.99f; //sure yes case ProbingState.FoundIt:
case ProbingState.NotMe: return 0.99f; //sure yes
return 0.01f; //sure no case ProbingState.NotMe:
default: return 0.01f; //sure no
for (int i = 0; i < PROBERS_NUM; i++) default:
{ for (int i = 0; i < PROBERS_NUM; i++)
if (!isActive[i])
continue;
cf = probers[i].GetConfidence();
if (bestConf < cf)
{ {
bestConf = cf; if (!isActive[i])
bestGuess = i; continue;
cf = probers[i].GetConfidence();
if (bestConf < cf)
{
bestConf = cf;
bestGuess = i;
}
} }
} break;
break;
} }
return bestConf; return bestConf;
} }
@ -137,8 +142,9 @@ namespace UniversalDetector.Core
public override void DumpStatus() public override void DumpStatus()
{ {
float cf = GetConfidence(); float cf = GetConfidence();
// Console.WriteLine(" SBCS Group Prober --------begin status"); // Console.WriteLine(" SBCS Group Prober --------begin status");
for (int i = 0; i < PROBERS_NUM; i++) { for (int i = 0; i < PROBERS_NUM; i++)
{
if (isActive[i]) if (isActive[i])
probers[i].DumpStatus(); probers[i].DumpStatus();
//else //else
@ -148,15 +154,19 @@ namespace UniversalDetector.Core
//Console.WriteLine(" SBCS Group found best match [{0}] confidence {1}.", probers[bestGuess].GetCharsetName(), cf); //Console.WriteLine(" SBCS Group found best match [{0}] confidence {1}.", probers[bestGuess].GetCharsetName(), cf);
} }
public override void Reset () public override void Reset()
{ {
int activeNum = 0; int activeNum = 0;
for (int i = 0; i < PROBERS_NUM; i++) { for (int i = 0; i < PROBERS_NUM; i++)
if (probers[i] != null) { {
if (probers[i] != null)
{
probers[i].Reset(); probers[i].Reset();
isActive[i] = true; isActive[i] = true;
activeNum++; activeNum++;
} else { }
else
{
isActive[i] = false; isActive[i] = false;
} }
} }
@ -167,7 +177,8 @@ namespace UniversalDetector.Core
public override string GetCharsetName() public override string GetCharsetName()
{ {
//if we have no answer yet //if we have no answer yet
if (bestGuess == -1) { if (bestGuess == -1)
{
GetConfidence(); GetConfidence();
//no charset seems positive //no charset seems positive
if (bestGuess == -1) if (bestGuess == -1)

View file

@ -36,7 +36,6 @@
* *
* ***** END LICENSE BLOCK ***** */ * ***** END LICENSE BLOCK ***** */
using System;
namespace UniversalDetector.Core namespace UniversalDetector.Core
{ {
@ -49,7 +48,7 @@ namespace UniversalDetector.Core
private const float NEGATIVE_SHORTCUT_THRESHOLD = 0.05f; private const float NEGATIVE_SHORTCUT_THRESHOLD = 0.05f;
private const int SYMBOL_CAT_ORDER = 250; private const int SYMBOL_CAT_ORDER = 250;
private const int NUMBER_OF_SEQ_CAT = 4; private const int NUMBER_OF_SEQ_CAT = 4;
private const int POSITIVE_CAT = NUMBER_OF_SEQ_CAT-1; private const int POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1;
private const int NEGATIVE_CAT = 0; private const int NEGATIVE_CAT = 0;
protected SequenceModel model; protected SequenceModel model;
@ -89,28 +88,33 @@ namespace UniversalDetector.Core
{ {
int max = offset + len; int max = offset + len;
for (int i = offset; i < max; i++) { for (int i = offset; i < max; i++)
{
byte order = model.GetOrder(buf[i]); byte order = model.GetOrder(buf[i]);
if (order < SYMBOL_CAT_ORDER) if (order < SYMBOL_CAT_ORDER)
totalChar++; totalChar++;
if (order < SAMPLE_SIZE) { if (order < SAMPLE_SIZE)
{
freqChar++; freqChar++;
if (lastOrder < SAMPLE_SIZE) { if (lastOrder < SAMPLE_SIZE)
{
totalSeqs++; totalSeqs++;
if (!reversed) if (!reversed)
++(seqCounters[model.GetPrecedence(lastOrder*SAMPLE_SIZE+order)]); ++(seqCounters[model.GetPrecedence(lastOrder * SAMPLE_SIZE + order)]);
else // reverse the order of the letters in the lookup else // reverse the order of the letters in the lookup
++(seqCounters[model.GetPrecedence(order*SAMPLE_SIZE+lastOrder)]); ++(seqCounters[model.GetPrecedence(order * SAMPLE_SIZE + lastOrder)]);
} }
} }
lastOrder = order; lastOrder = order;
} }
if (state == ProbingState.Detecting) { if (state == ProbingState.Detecting)
if (totalSeqs > SB_ENOUGH_REL_THRESHOLD) { {
if (totalSeqs > SB_ENOUGH_REL_THRESHOLD)
{
float cf = GetConfidence(); float cf = GetConfidence();
if (cf > POSITIVE_SHORTCUT_THRESHOLD) if (cf > POSITIVE_SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt; state = ProbingState.FoundIt;
@ -139,7 +143,8 @@ namespace UniversalDetector.Core
// POSITIVE_APPROACH // POSITIVE_APPROACH
float r = 0.0f; float r = 0.0f;
if (totalSeqs > 0) { if (totalSeqs > 0)
{
r = 1.0f * seqCounters[POSITIVE_CAT] / totalSeqs / model.TypicalPositiveRatio; r = 1.0f * seqCounters[POSITIVE_CAT] / totalSeqs / model.TypicalPositiveRatio;
r = r * freqChar / totalChar; r = r * freqChar / totalChar;
if (r >= 1.0f) if (r >= 1.0f)

View file

@ -69,29 +69,36 @@ namespace UniversalDetector.Core
int codingState; int codingState;
int max = offset + len; int max = offset + len;
for (int i = offset; i < max; i++) { for (int i = offset; i < max; i++)
{
codingState = codingSM.NextState(buf[i]); codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) { if (codingState == SMModel.ERROR)
{
state = ProbingState.NotMe; state = ProbingState.NotMe;
break; break;
} }
if (codingState == SMModel.ITSME) { if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt; state = ProbingState.FoundIt;
break; break;
} }
if (codingState == SMModel.START) { if (codingState == SMModel.START)
{
int charLen = codingSM.CurrentCharLen; int charLen = codingSM.CurrentCharLen;
if (i == offset) { if (i == offset)
{
lastChar[1] = buf[offset]; lastChar[1] = buf[offset];
contextAnalyser.HandleOneChar(lastChar, 2-charLen, charLen); contextAnalyser.HandleOneChar(lastChar, 2 - charLen, charLen);
distributionAnalyser.HandleOneChar(lastChar, 0, charLen); distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
} else { }
contextAnalyser.HandleOneChar(buf, i+1-charLen, charLen); else
distributionAnalyser.HandleOneChar(buf, i-1, charLen); {
contextAnalyser.HandleOneChar(buf, i + 1 - charLen, charLen);
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
} }
} }
} }
lastChar[0] = buf[max-1]; lastChar[0] = buf[max - 1];
if (state == ProbingState.Detecting) if (state == ProbingState.Detecting)
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt; state = ProbingState.FoundIt;

View file

@ -36,7 +36,6 @@
* *
* ***** END LICENSE BLOCK ***** */ * ***** END LICENSE BLOCK ***** */
using System;
namespace UniversalDetector.Core namespace UniversalDetector.Core
{ {
@ -54,7 +53,7 @@ namespace UniversalDetector.Core
public int[] charLenTable; public int[] charLenTable;
private string name; private string name;
public string Name => name; public string Name => name;
private int classFactor; private int classFactor;

View file

@ -36,7 +36,6 @@
* *
* ***** END LICENSE BLOCK ***** */ * ***** END LICENSE BLOCK ***** */
using System;
namespace UniversalDetector.Core namespace UniversalDetector.Core
{ {
@ -51,12 +50,12 @@ namespace UniversalDetector.Core
// freqSeqs / totalSeqs // freqSeqs / totalSeqs
protected float typicalPositiveRatio; protected float typicalPositiveRatio;
public float TypicalPositiveRatio => typicalPositiveRatio; public float TypicalPositiveRatio => typicalPositiveRatio;
// not used // not used
protected bool keepEnglishLetter; protected bool keepEnglishLetter;
public bool KeepEnglishLetter => keepEnglishLetter; public bool KeepEnglishLetter => keepEnglishLetter;
protected string charsetName; protected string charsetName;

View file

@ -51,7 +51,8 @@ namespace UniversalDetector.Core
Reset(); Reset();
} }
public override string GetCharsetName() { public override string GetCharsetName()
{
return "UTF-8"; return "UTF-8";
} }
@ -67,21 +68,25 @@ namespace UniversalDetector.Core
int codingState = SMModel.START; int codingState = SMModel.START;
int max = offset + len; int max = offset + len;
for (int i = offset; i < max; i++) { for (int i = offset; i < max; i++)
{
codingState = codingSM.NextState(buf[i]); codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR) { if (codingState == SMModel.ERROR)
{
state = ProbingState.NotMe; state = ProbingState.NotMe;
break; break;
} }
if (codingState == SMModel.ITSME) { if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt; state = ProbingState.FoundIt;
break; break;
} }
if (codingState == SMModel.START) { if (codingState == SMModel.START)
{
if (codingSM.CurrentCharLen >= 2) if (codingSM.CurrentCharLen >= 2)
numOfMBChar++; numOfMBChar++;
} }
@ -98,11 +103,14 @@ namespace UniversalDetector.Core
float unlike = 0.99f; float unlike = 0.99f;
float confidence = 0.0f; float confidence = 0.0f;
if (numOfMBChar < 6) { if (numOfMBChar < 6)
{
for (int i = 0; i < numOfMBChar; i++) for (int i = 0; i < numOfMBChar; i++)
unlike *= ONE_CHAR_PROB; unlike *= ONE_CHAR_PROB;
confidence = 1.0f - unlike; confidence = 1.0f - unlike;
} else { }
else
{
confidence = 0.99f; confidence = 0.99f;
} }
return confidence; return confidence;

View file

@ -39,7 +39,7 @@
namespace UniversalDetector.Core namespace UniversalDetector.Core
{ {
enum InputState { PureASCII=0, EscASCII=1, Highbyte=2 }; enum InputState { PureASCII = 0, EscASCII = 1, Highbyte = 2 };
public abstract class UniversalDetector public abstract class UniversalDetector
{ {
@ -70,7 +70,8 @@ namespace UniversalDetector.Core
protected CharsetProber escCharsetProber; protected CharsetProber escCharsetProber;
protected string detectedCharset; protected string detectedCharset;
public UniversalDetector(int languageFilter) { public UniversalDetector(int languageFilter)
{
this.start = true; this.start = true;
this.inputState = InputState.PureASCII; this.inputState = InputState.PureASCII;
this.lastChar = 0x00; this.lastChar = 0x00;
@ -80,7 +81,8 @@ namespace UniversalDetector.Core
public virtual void Feed(byte[] buf, int offset, int len) public virtual void Feed(byte[] buf, int offset, int len)
{ {
if (done) { if (done)
{
return; return;
} }
@ -88,52 +90,60 @@ namespace UniversalDetector.Core
gotData = true; gotData = true;
// If the data starts with BOM, we know it is UTF // If the data starts with BOM, we know it is UTF
if (start) { if (start)
{
start = false; start = false;
if (len > 3) { if (len > 3)
switch (buf[0]) { {
case 0xEF: switch (buf[0])
if (0xBB == buf[1] && 0xBF == buf[2]) {
detectedCharset = "UTF-8"; case 0xEF:
break; if (0xBB == buf[1] && 0xBF == buf[2])
case 0xFE: detectedCharset = "UTF-8";
if (0xFF == buf[1] && 0x00 == buf[2] && 0x00 == buf[3]) break;
// FE FF 00 00 UCS-4, unusual octet order BOM (3412) case 0xFE:
detectedCharset = "X-ISO-10646-UCS-4-3412"; if (0xFF == buf[1] && 0x00 == buf[2] && 0x00 == buf[3])
else if (0xFF == buf[1]) // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
detectedCharset = "UTF-16BE"; detectedCharset = "X-ISO-10646-UCS-4-3412";
break; else if (0xFF == buf[1])
case 0x00: detectedCharset = "UTF-16BE";
if (0x00 == buf[1] && 0xFE == buf[2] && 0xFF == buf[3]) break;
detectedCharset = "UTF-32BE"; case 0x00:
else if (0x00 == buf[1] && 0xFF == buf[2] && 0xFE == buf[3]) if (0x00 == buf[1] && 0xFE == buf[2] && 0xFF == buf[3])
// 00 00 FF FE UCS-4, unusual octet order BOM (2143) detectedCharset = "UTF-32BE";
detectedCharset = "X-ISO-10646-UCS-4-2143"; else if (0x00 == buf[1] && 0xFF == buf[2] && 0xFE == buf[3])
break; // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
case 0xFF: detectedCharset = "X-ISO-10646-UCS-4-2143";
if (0xFE == buf[1] && 0x00 == buf[2] && 0x00 == buf[3]) break;
detectedCharset = "UTF-32LE"; case 0xFF:
else if (0xFE == buf[1]) if (0xFE == buf[1] && 0x00 == buf[2] && 0x00 == buf[3])
detectedCharset = "UTF-16LE"; detectedCharset = "UTF-32LE";
break; else if (0xFE == buf[1])
detectedCharset = "UTF-16LE";
break;
} // switch } // switch
} }
if (detectedCharset != null) { if (detectedCharset != null)
{
done = true; done = true;
return; return;
} }
} }
for (int i = 0; i < len; i++) { for (int i = 0; i < len; i++)
{
// other than 0xa0, if every other character is ascii, the page is ascii // other than 0xa0, if every other character is ascii, the page is ascii
if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0) { if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
{
// we got a non-ascii byte (high-byte) // we got a non-ascii byte (high-byte)
if (inputState != InputState.Highbyte) { if (inputState != InputState.Highbyte)
{
inputState = InputState.Highbyte; inputState = InputState.Highbyte;
// kill EscCharsetProber if it is active // kill EscCharsetProber if it is active
if (escCharsetProber != null) { if (escCharsetProber != null)
{
escCharsetProber = null; escCharsetProber = null;
} }
@ -145,9 +155,12 @@ namespace UniversalDetector.Core
if (charsetProbers[2] == null) if (charsetProbers[2] == null)
charsetProbers[2] = new Latin1Prober(); charsetProbers[2] = new Latin1Prober();
} }
} else { }
else
{
if (inputState == InputState.PureASCII && if (inputState == InputState.PureASCII &&
(buf[i] == 0x33 || (buf[i] == 0x7B && lastChar == 0x7E))) { (buf[i] == 0x33 || (buf[i] == 0x7B && lastChar == 0x7E)))
{
// found escape character or HZ "~{" // found escape character or HZ "~{"
inputState = InputState.EscASCII; inputState = InputState.EscASCII;
} }
@ -157,25 +170,31 @@ namespace UniversalDetector.Core
ProbingState st = ProbingState.NotMe; ProbingState st = ProbingState.NotMe;
switch (inputState) { switch (inputState)
{
case InputState.EscASCII: case InputState.EscASCII:
if (escCharsetProber == null) { if (escCharsetProber == null)
{
escCharsetProber = new EscCharsetProber(); escCharsetProber = new EscCharsetProber();
} }
st = escCharsetProber.HandleData(buf, offset, len); st = escCharsetProber.HandleData(buf, offset, len);
if (st == ProbingState.FoundIt) { if (st == ProbingState.FoundIt)
{
done = true; done = true;
detectedCharset = escCharsetProber.GetCharsetName(); detectedCharset = escCharsetProber.GetCharsetName();
} }
break; break;
case InputState.Highbyte: case InputState.Highbyte:
for (int i = 0; i < PROBERS_NUM; i++) { for (int i = 0; i < PROBERS_NUM; i++)
if (charsetProbers[i] != null) { {
if (charsetProbers[i] != null)
{
st = charsetProbers[i].HandleData(buf, offset, len); st = charsetProbers[i].HandleData(buf, offset, len);
#if DEBUG #if DEBUG
charsetProbers[i].DumpStatus(); charsetProbers[i].DumpStatus();
#endif #endif
if (st == ProbingState.FoundIt) { if (st == ProbingState.FoundIt)
{
done = true; done = true;
detectedCharset = charsetProbers[i].GetCharsetName(); detectedCharset = charsetProbers[i].GetCharsetName();
return; return;
@ -195,38 +214,47 @@ namespace UniversalDetector.Core
/// </summary> /// </summary>
public virtual void DataEnd() public virtual void DataEnd()
{ {
if (!gotData) { if (!gotData)
{
// we haven't got any data yet, return immediately // we haven't got any data yet, return immediately
// caller program sometimes call DataEnd before anything has // caller program sometimes call DataEnd before anything has
// been sent to detector // been sent to detector
return; return;
} }
if (detectedCharset != null) { if (detectedCharset != null)
{
done = true; done = true;
Report(detectedCharset, 1.0f); Report(detectedCharset, 1.0f);
return; return;
} }
if (inputState == InputState.Highbyte) { if (inputState == InputState.Highbyte)
{
float proberConfidence = 0.0f; float proberConfidence = 0.0f;
float maxProberConfidence = 0.0f; float maxProberConfidence = 0.0f;
int maxProber = 0; int maxProber = 0;
for (int i = 0; i < PROBERS_NUM; i++) { for (int i = 0; i < PROBERS_NUM; i++)
if (charsetProbers[i] != null) { {
if (charsetProbers[i] != null)
{
proberConfidence = charsetProbers[i].GetConfidence(); proberConfidence = charsetProbers[i].GetConfidence();
if (proberConfidence > maxProberConfidence) { if (proberConfidence > maxProberConfidence)
{
maxProberConfidence = proberConfidence; maxProberConfidence = proberConfidence;
maxProber = i; maxProber = i;
} }
} }
} }
if (maxProberConfidence > MINIMUM_THRESHOLD) { if (maxProberConfidence > MINIMUM_THRESHOLD)
{
Report(charsetProbers[maxProber].GetCharsetName(), maxProberConfidence); Report(charsetProbers[maxProber].GetCharsetName(), maxProberConfidence);
} }
} else if (inputState == InputState.PureASCII) { }
else if (inputState == InputState.PureASCII)
{
Report("ASCII", 1.0f); Report("ASCII", 1.0f);
} }
} }