mirror of
https://github.com/jellyfin/jellyfin.git
synced 2024-07-09 07:10:34 +02:00
Visual Studio Reformat: Emby.Server.Implementations Part T-T
This commit is contained in:
parent
0efc699e3d
commit
25f0315e91
|
@ -1,14 +1,14 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using MediaBrowser.Controller.Configuration;
|
||||
using MediaBrowser.Controller.Dto;
|
||||
using MediaBrowser.Controller.Entities;
|
||||
using MediaBrowser.Controller.Entities.TV;
|
||||
using MediaBrowser.Controller.Library;
|
||||
using MediaBrowser.Controller.TV;
|
||||
using MediaBrowser.Model.Entities;
|
||||
using MediaBrowser.Model.Querying;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using MediaBrowser.Controller.Configuration;
|
||||
using MediaBrowser.Controller.Dto;
|
||||
|
||||
namespace Emby.Server.Implementations.TV
|
||||
{
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.IO.Compression;
|
||||
using NLangDetect.Core.Utils;
|
||||
using MediaBrowser.Model.Serialization;
|
||||
using System.Linq;
|
||||
using MediaBrowser.Model.Serialization;
|
||||
using NLangDetect.Core.Utils;
|
||||
|
||||
namespace NLangDetect.Core
|
||||
{
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
using System;
|
||||
using System.IO;
|
||||
using System.IO.Compression;
|
||||
using System.Xml;
|
||||
using NLangDetect.Core.Utils;
|
||||
using System.IO;
|
||||
|
||||
namespace NLangDetect.Core
|
||||
{
|
||||
|
|
|
@ -1,10 +1,9 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Globalization;
|
||||
using System.IO;
|
||||
using System.Reflection;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Linq;
|
||||
using System;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace NLangDetect.Core.Utils
|
||||
{
|
||||
|
@ -29,7 +28,7 @@ namespace NLangDetect.Core.Utils
|
|||
|
||||
private static Dictionary<string, string> LoadMessages()
|
||||
{
|
||||
var manifestName = typeof(Messages).Assembly.GetManifestResourceNames().FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1) ;
|
||||
var manifestName = typeof(Messages).Assembly.GetManifestResourceNames().FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1);
|
||||
|
||||
Stream messagesStream =
|
||||
typeof(Messages).Assembly
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
using System;
|
||||
using System.Text;
|
||||
using MediaBrowser.Model.IO;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using MediaBrowser.Model.Serialization;
|
||||
using MediaBrowser.Model.Text;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using NLangDetect.Core;
|
||||
using UniversalDetector;
|
||||
|
||||
|
|
|
@ -109,9 +109,9 @@ namespace UniversalDetector
|
|||
{
|
||||
this.charset = charset;
|
||||
this.confidence = confidence;
|
||||
// if (Finished != null) {
|
||||
// Finished(charset, confidence);
|
||||
// }
|
||||
// if (Finished != null) {
|
||||
// Finished(charset, confidence);
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -57,27 +57,34 @@ namespace UniversalDetector.Core
|
|||
int codingState = 0;
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
for (int i = offset; i < max; i++)
|
||||
{
|
||||
codingState = codingSM.NextState(buf[i]);
|
||||
if (codingState == SMModel.ERROR) {
|
||||
if (codingState == SMModel.ERROR)
|
||||
{
|
||||
state = ProbingState.NotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == SMModel.ITSME) {
|
||||
if (codingState == SMModel.ITSME)
|
||||
{
|
||||
state = ProbingState.FoundIt;
|
||||
break;
|
||||
}
|
||||
if (codingState == SMModel.START) {
|
||||
if (codingState == SMModel.START)
|
||||
{
|
||||
int charLen = codingSM.CurrentCharLen;
|
||||
if (i == offset) {
|
||||
if (i == offset)
|
||||
{
|
||||
lastChar[1] = buf[offset];
|
||||
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||
} else {
|
||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
||||
}
|
||||
else
|
||||
{
|
||||
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
|
||||
}
|
||||
}
|
||||
}
|
||||
lastChar[0] = buf[max-1];
|
||||
lastChar[0] = buf[max - 1];
|
||||
|
||||
if (state == ProbingState.Detecting)
|
||||
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
|
|
|
@ -97,9 +97,11 @@ namespace UniversalDetector.Core
|
|||
{
|
||||
//we only care about 2-bytes character in our distribution analysis
|
||||
int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
|
||||
if (order >= 0) {
|
||||
if (order >= 0)
|
||||
{
|
||||
totalChars++;
|
||||
if (order < tableSize) { // order is valid
|
||||
if (order < tableSize)
|
||||
{ // order is valid
|
||||
if (512 > charToFreqOrder[order])
|
||||
freqChars++;
|
||||
}
|
||||
|
@ -124,7 +126,8 @@ namespace UniversalDetector.Core
|
|||
// negative answer
|
||||
if (totalChars <= 0 || freqChars <= MINIMUM_DATA_THRESHOLD)
|
||||
return SURE_NO;
|
||||
if (totalChars != freqChars) {
|
||||
if (totalChars != freqChars)
|
||||
{
|
||||
float r = freqChars / ((totalChars - freqChars) * typicalDistributionRatio);
|
||||
if (r < SURE_YES)
|
||||
return r;
|
||||
|
@ -610,8 +613,8 @@ namespace UniversalDetector.Core
|
|||
/// <returns></returns>
|
||||
public override int GetOrder(byte[] buf, int offset)
|
||||
{
|
||||
if (buf[offset] >= 0xB0 && buf[offset+1] >= 0xA1)
|
||||
return 94 * (buf[offset] - 0xb0) + buf[offset+1] - 0xA1;
|
||||
if (buf[offset] >= 0xB0 && buf[offset + 1] >= 0xA1)
|
||||
return 94 * (buf[offset] - 0xb0) + buf[offset + 1] - 0xA1;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
@ -1040,7 +1043,7 @@ namespace UniversalDetector.Core
|
|||
public override int GetOrder(byte[] buf, int offset)
|
||||
{
|
||||
if (buf[offset] >= 0xC4)
|
||||
return 94 * (buf[offset] - 0xC4) + buf[offset+1] - 0xA1;
|
||||
return 94 * (buf[offset] - 0xC4) + buf[offset + 1] - 0xA1;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
@ -1634,7 +1637,7 @@ namespace UniversalDetector.Core
|
|||
public override int GetOrder(byte[] buf, int offset)
|
||||
{
|
||||
if (buf[offset] >= 0xB0)
|
||||
return 94 * (buf[offset] - 0xB0) + buf[offset+1] - 0xA1;
|
||||
return 94 * (buf[offset] - 0xB0) + buf[offset + 1] - 0xA1;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
@ -2559,12 +2562,15 @@ namespace UniversalDetector.Core
|
|||
/// </summary>
|
||||
public override int GetOrder(byte[] buf, int offset)
|
||||
{
|
||||
if (buf[offset] >= 0xA4) {
|
||||
if (buf[offset+1] >= 0xA1)
|
||||
return 157 * (buf[offset] - 0xA4) + buf[offset+1] - 0xA1 + 63;
|
||||
if (buf[offset] >= 0xA4)
|
||||
{
|
||||
if (buf[offset + 1] >= 0xA1)
|
||||
return 157 * (buf[offset] - 0xA4) + buf[offset + 1] - 0xA1 + 63;
|
||||
else
|
||||
return 157 * (buf[offset] - 0xA4) + buf[offset+1] - 0x40;
|
||||
} else {
|
||||
return 157 * (buf[offset] - 0xA4) + buf[offset + 1] - 0x40;
|
||||
}
|
||||
else
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
@ -3140,9 +3146,9 @@ namespace UniversalDetector.Core
|
|||
order = 188 * (buf[offset] - 0xE0 + 31);
|
||||
else
|
||||
return -1;
|
||||
order += buf[offset+1] - 0x40;
|
||||
order += buf[offset + 1] - 0x40;
|
||||
|
||||
if (buf[offset+1] > 0x7F)
|
||||
if (buf[offset + 1] > 0x7F)
|
||||
order--;
|
||||
return order;
|
||||
}
|
||||
|
@ -3162,7 +3168,7 @@ namespace UniversalDetector.Core
|
|||
public override int GetOrder(byte[] buf, int offset)
|
||||
{
|
||||
if (buf[offset] >= 0xA0)
|
||||
return 94 * (buf[offset] - 0xA1) + buf[offset+1] - 0xA1;
|
||||
return 94 * (buf[offset] - 0xA1) + buf[offset + 1] - 0xA1;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
|
|
@ -40,7 +40,8 @@ using System.IO;
|
|||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public enum ProbingState {
|
||||
public enum ProbingState
|
||||
{
|
||||
Detecting = 0, // no sure answer yet, but caller can ask for confidence
|
||||
FoundIt = 1, // positive answer
|
||||
NotMe = 2 // negative answer
|
||||
|
@ -107,21 +108,27 @@ namespace UniversalDetector.Core
|
|||
{
|
||||
byte[] result = null;
|
||||
|
||||
using (MemoryStream ms = new MemoryStream(buf.Length)) {
|
||||
using (MemoryStream ms = new MemoryStream(buf.Length))
|
||||
{
|
||||
|
||||
bool meetMSB = false;
|
||||
int max = offset + len;
|
||||
int prev = offset;
|
||||
int cur = offset;
|
||||
|
||||
while (cur < max) {
|
||||
while (cur < max)
|
||||
{
|
||||
byte b = buf[cur];
|
||||
|
||||
if ((b & 0x80) != 0) {
|
||||
if ((b & 0x80) != 0)
|
||||
{
|
||||
meetMSB = true;
|
||||
} else if (b < CAPITAL_A || (b > CAPITAL_Z && b < SMALL_A)
|
||||
|| b > SMALL_Z) {
|
||||
if (meetMSB && cur > prev) {
|
||||
}
|
||||
else if (b < CAPITAL_A || (b > CAPITAL_Z && b < SMALL_A)
|
||||
|| b > SMALL_Z)
|
||||
{
|
||||
if (meetMSB && cur > prev)
|
||||
{
|
||||
ms.Write(buf, prev, cur - prev);
|
||||
ms.WriteByte(SPACE);
|
||||
meetMSB = false;
|
||||
|
@ -149,14 +156,16 @@ namespace UniversalDetector.Core
|
|||
{
|
||||
byte[] result = null;
|
||||
|
||||
using (MemoryStream ms = new MemoryStream(buf.Length)) {
|
||||
using (MemoryStream ms = new MemoryStream(buf.Length))
|
||||
{
|
||||
|
||||
bool inTag = false;
|
||||
int max = offset + len;
|
||||
int prev = offset;
|
||||
int cur = offset;
|
||||
|
||||
while (cur < max) {
|
||||
while (cur < max)
|
||||
{
|
||||
|
||||
byte b = buf[cur];
|
||||
|
||||
|
@ -167,8 +176,10 @@ namespace UniversalDetector.Core
|
|||
|
||||
// it's ascii, but it's not a letter
|
||||
if ((b & 0x80) == 0 && (b < CAPITAL_A || b > SMALL_Z
|
||||
|| (b > CAPITAL_Z && b < SMALL_A))) {
|
||||
if (cur > prev && !inTag) {
|
||||
|| (b > CAPITAL_Z && b < SMALL_A)))
|
||||
{
|
||||
if (cur > prev && !inTag)
|
||||
{
|
||||
ms.Write(buf, prev, cur - prev);
|
||||
ms.WriteByte(SPACE);
|
||||
}
|
||||
|
|
|
@ -60,7 +60,8 @@ namespace UniversalDetector.Core
|
|||
// for each byte we get its class, if it is first byte,
|
||||
// we also get byte length
|
||||
int byteCls = model.GetClass(b);
|
||||
if (currentState == SMModel.START) {
|
||||
if (currentState == SMModel.START)
|
||||
{
|
||||
currentBytePos = 0;
|
||||
currentCharLen = model.charLenTable[byteCls];
|
||||
}
|
||||
|
|
|
@ -62,29 +62,36 @@ namespace UniversalDetector.Core
|
|||
int codingState;
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
for (int i = offset; i < max; i++)
|
||||
{
|
||||
codingState = codingSM.NextState(buf[i]);
|
||||
if (codingState == SMModel.ERROR) {
|
||||
if (codingState == SMModel.ERROR)
|
||||
{
|
||||
state = ProbingState.NotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == SMModel.ITSME) {
|
||||
if (codingState == SMModel.ITSME)
|
||||
{
|
||||
state = ProbingState.FoundIt;
|
||||
break;
|
||||
}
|
||||
if (codingState == SMModel.START) {
|
||||
if (codingState == SMModel.START)
|
||||
{
|
||||
int charLen = codingSM.CurrentCharLen;
|
||||
if (i == offset) {
|
||||
if (i == offset)
|
||||
{
|
||||
lastChar[1] = buf[offset];
|
||||
contextAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||
} else {
|
||||
contextAnalyser.HandleOneChar(buf, i-1, charLen);
|
||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
||||
}
|
||||
else
|
||||
{
|
||||
contextAnalyser.HandleOneChar(buf, i - 1, charLen);
|
||||
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
|
||||
}
|
||||
}
|
||||
}
|
||||
lastChar[0] = buf[max-1];
|
||||
lastChar[0] = buf[max - 1];
|
||||
if (state == ProbingState.Detecting)
|
||||
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
state = ProbingState.FoundIt;
|
||||
|
|
|
@ -60,27 +60,34 @@ namespace UniversalDetector.Core
|
|||
int codingState;
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
for (int i = offset; i < max; i++)
|
||||
{
|
||||
codingState = codingSM.NextState(buf[i]);
|
||||
if (codingState == SMModel.ERROR) {
|
||||
if (codingState == SMModel.ERROR)
|
||||
{
|
||||
state = ProbingState.NotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == SMModel.ITSME) {
|
||||
if (codingState == SMModel.ITSME)
|
||||
{
|
||||
state = ProbingState.FoundIt;
|
||||
break;
|
||||
}
|
||||
if (codingState == SMModel.START) {
|
||||
if (codingState == SMModel.START)
|
||||
{
|
||||
int charLen = codingSM.CurrentCharLen;
|
||||
if (i == offset) {
|
||||
if (i == offset)
|
||||
{
|
||||
lastChar[1] = buf[offset];
|
||||
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||
} else {
|
||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
||||
}
|
||||
else
|
||||
{
|
||||
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
|
||||
}
|
||||
}
|
||||
}
|
||||
lastChar[0] = buf[max-1];
|
||||
lastChar[0] = buf[max - 1];
|
||||
|
||||
if (state == ProbingState.Detecting)
|
||||
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
|
|
|
@ -56,27 +56,34 @@ namespace UniversalDetector.Core
|
|||
int codingState;
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = 0; i < max; i++) {
|
||||
for (int i = 0; i < max; i++)
|
||||
{
|
||||
codingState = codingSM.NextState(buf[i]);
|
||||
if (codingState == SMModel.ERROR) {
|
||||
if (codingState == SMModel.ERROR)
|
||||
{
|
||||
state = ProbingState.NotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == SMModel.ITSME) {
|
||||
if (codingState == SMModel.ITSME)
|
||||
{
|
||||
state = ProbingState.FoundIt;
|
||||
break;
|
||||
}
|
||||
if (codingState == SMModel.START) {
|
||||
if (codingState == SMModel.START)
|
||||
{
|
||||
int charLen = codingSM.CurrentCharLen;
|
||||
if (i == offset) {
|
||||
if (i == offset)
|
||||
{
|
||||
lastChar[1] = buf[offset];
|
||||
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||
} else {
|
||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
||||
}
|
||||
else
|
||||
{
|
||||
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
|
||||
}
|
||||
}
|
||||
}
|
||||
lastChar[0] = buf[max-1];
|
||||
lastChar[0] = buf[max - 1];
|
||||
|
||||
if (state == ProbingState.Detecting)
|
||||
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
|
|
|
@ -67,22 +67,30 @@ namespace UniversalDetector.Core
|
|||
{
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = offset; i < max && state == ProbingState.Detecting; i++) {
|
||||
for (int j = activeSM - 1; j >= 0; j--) {
|
||||
for (int i = offset; i < max && state == ProbingState.Detecting; i++)
|
||||
{
|
||||
for (int j = activeSM - 1; j >= 0; j--)
|
||||
{
|
||||
// byte is feed to all active state machine
|
||||
int codingState = codingSM[j].NextState(buf[i]);
|
||||
if (codingState == SMModel.ERROR) {
|
||||
if (codingState == SMModel.ERROR)
|
||||
{
|
||||
// got negative answer for this state machine, make it inactive
|
||||
activeSM--;
|
||||
if (activeSM == 0) {
|
||||
if (activeSM == 0)
|
||||
{
|
||||
state = ProbingState.NotMe;
|
||||
return state;
|
||||
} else if (j != activeSM) {
|
||||
}
|
||||
else if (j != activeSM)
|
||||
{
|
||||
CodingStateMachine t = codingSM[activeSM];
|
||||
codingSM[activeSM] = codingSM[j];
|
||||
codingSM[j] = t;
|
||||
}
|
||||
} else if (codingState == SMModel.ITSME) {
|
||||
}
|
||||
else if (codingState == SMModel.ITSME)
|
||||
{
|
||||
state = ProbingState.FoundIt;
|
||||
detectedCharset = codingSM[j].ModelName;
|
||||
return state;
|
||||
|
|
|
@ -87,7 +87,7 @@ namespace UniversalDetector.Core
|
|||
BitPackage.Pack4bits( 4, ITSME, START, START, START, START, START, START) //28-2f
|
||||
};
|
||||
|
||||
private readonly static int[] HZCharLenTable = {0, 0, 0, 0, 0, 0};
|
||||
private readonly static int[] HZCharLenTable = { 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
public HZSMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
|
@ -153,7 +153,7 @@ namespace UniversalDetector.Core
|
|||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START) //38-3f
|
||||
};
|
||||
|
||||
private readonly static int[] ISO2022CNCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
private readonly static int[] ISO2022CNCharLenTable = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
public ISO2022CNSMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
|
@ -220,7 +220,7 @@ namespace UniversalDetector.Core
|
|||
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ERROR,START,START) //40-47
|
||||
};
|
||||
|
||||
private readonly static int[] ISO2022JPCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
private readonly static int[] ISO2022JPCharLenTable = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
public ISO2022JPSMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
|
@ -284,7 +284,7 @@ namespace UniversalDetector.Core
|
|||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,START,START,START,START) //20-27
|
||||
};
|
||||
|
||||
private readonly static int[] ISO2022KRCharLenTable = {0, 0, 0, 0, 0, 0};
|
||||
private readonly static int[] ISO2022KRCharLenTable = { 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
public ISO2022KRSMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
|
|
|
@ -64,30 +64,38 @@ namespace UniversalDetector.Core
|
|||
int codingState = SMModel.START;
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
for (int i = offset; i < max; i++)
|
||||
{
|
||||
codingState = codingSM.NextState(buf[i]);
|
||||
if (codingState == SMModel.ERROR) {
|
||||
if (codingState == SMModel.ERROR)
|
||||
{
|
||||
state = ProbingState.NotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == SMModel.ITSME) {
|
||||
if (codingState == SMModel.ITSME)
|
||||
{
|
||||
state = ProbingState.FoundIt;
|
||||
break;
|
||||
}
|
||||
if (codingState == SMModel.START) {
|
||||
if (codingState == SMModel.START)
|
||||
{
|
||||
int charLen = codingSM.CurrentCharLen;
|
||||
if (i == offset) {
|
||||
if (i == offset)
|
||||
{
|
||||
lastChar[1] = buf[offset];
|
||||
analyser.HandleOneChar(lastChar, 0, charLen);
|
||||
} else {
|
||||
analyser.HandleOneChar(buf, i-1, charLen);
|
||||
}
|
||||
else
|
||||
{
|
||||
analyser.HandleOneChar(buf, i - 1, charLen);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lastChar[0] = buf[max-1];
|
||||
lastChar[0] = buf[max - 1];
|
||||
|
||||
if (state == ProbingState.Detecting) {
|
||||
if (state == ProbingState.Detecting)
|
||||
{
|
||||
if (analyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
state = ProbingState.FoundIt;
|
||||
}
|
||||
|
|
|
@ -36,7 +36,6 @@
|
|||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
/**
|
||||
* General ideas of the Hebrew charset recognition
|
||||
|
@ -217,14 +216,17 @@ namespace UniversalDetector.Core
|
|||
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
for (int i = offset; i < max; i++)
|
||||
{
|
||||
|
||||
byte b = buf[i];
|
||||
|
||||
// a word just ended
|
||||
if (b == 0x20) {
|
||||
if (b == 0x20)
|
||||
{
|
||||
// *(curPtr-2) was not a space so prev is not a 1 letter word
|
||||
if (beforePrev != 0x20) {
|
||||
if (beforePrev != 0x20)
|
||||
{
|
||||
// case (1) [-2:not space][-1:final letter][cur:space]
|
||||
if (IsFinal(prev))
|
||||
finalCharLogicalScore++;
|
||||
|
@ -233,7 +235,9 @@ namespace UniversalDetector.Core
|
|||
finalCharVisualScore++;
|
||||
}
|
||||
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
// case (3) [-2:space][-1:final letter][cur:not space]
|
||||
if ((beforePrev == 0x20) && (IsFinal(prev)) && (b != ' '))
|
||||
++finalCharVisualScore;
|
||||
|
|
|
@ -160,7 +160,7 @@ namespace UniversalDetector.Core
|
|||
{
|
||||
// This is just one way to calculate confidence. It works well for me.
|
||||
if (totalRel > MINIMUM_DATA_THRESHOLD)
|
||||
return ((float)(totalRel - relSample[0]))/totalRel;
|
||||
return ((float)(totalRel - relSample[0])) / totalRel;
|
||||
else
|
||||
return DONT_KNOW;
|
||||
}
|
||||
|
@ -181,16 +181,22 @@ namespace UniversalDetector.Core
|
|||
// to record those bytes as well and analyse the character once it
|
||||
// is complete, but since a character will not make much difference,
|
||||
// skipping it will simplify our logic and improve performance.
|
||||
for (int i = needToSkipCharNum+offset; i < max; ) {
|
||||
for (int i = needToSkipCharNum + offset; i < max;)
|
||||
{
|
||||
int order = GetOrder(buf, i, out charLen);
|
||||
i += charLen;
|
||||
if (i > max) {
|
||||
if (i > max)
|
||||
{
|
||||
needToSkipCharNum = i - max;
|
||||
lastCharOrder = -1;
|
||||
} else {
|
||||
if (order != -1 && lastCharOrder != -1) {
|
||||
totalRel ++;
|
||||
if (totalRel > MAX_REL_THRESHOLD) {
|
||||
}
|
||||
else
|
||||
{
|
||||
if (order != -1 && lastCharOrder != -1)
|
||||
{
|
||||
totalRel++;
|
||||
if (totalRel > MAX_REL_THRESHOLD)
|
||||
{
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
|
@ -210,7 +216,8 @@ namespace UniversalDetector.Core
|
|||
|
||||
// Only 2-bytes characters are of our interest
|
||||
int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
|
||||
if (order != -1 && lastCharOrder != -1) {
|
||||
if (order != -1 && lastCharOrder != -1)
|
||||
{
|
||||
totalRel++;
|
||||
// count this sequence to its category counter
|
||||
relSample[jp2CharContext[lastCharOrder, order]]++;
|
||||
|
@ -221,7 +228,8 @@ namespace UniversalDetector.Core
|
|||
public void Reset()
|
||||
{
|
||||
totalRel = 0;
|
||||
for (int i = 0; i < CATEGORIES_NUM; i++) {
|
||||
for (int i = 0; i < CATEGORIES_NUM; i++)
|
||||
{
|
||||
relSample[i] = 0;
|
||||
needToSkipCharNum = 0;
|
||||
lastCharOrder = -1;
|
||||
|
@ -254,8 +262,9 @@ namespace UniversalDetector.Core
|
|||
charLen = 1;
|
||||
|
||||
// return its order if it is hiragana
|
||||
if (buf[offset] == HIRAGANA_FIRST_BYTE) {
|
||||
byte low = buf[offset+1];
|
||||
if (buf[offset] == HIRAGANA_FIRST_BYTE)
|
||||
{
|
||||
byte low = buf[offset + 1];
|
||||
if (low >= 0x9F && low <= 0xF1)
|
||||
return low - 0x9F;
|
||||
}
|
||||
|
@ -265,8 +274,9 @@ namespace UniversalDetector.Core
|
|||
protected override int GetOrder(byte[] buf, int offset)
|
||||
{
|
||||
// We are only interested in Hiragana
|
||||
if (buf[offset] == HIRAGANA_FIRST_BYTE) {
|
||||
byte low = buf[offset+1];
|
||||
if (buf[offset] == HIRAGANA_FIRST_BYTE)
|
||||
{
|
||||
byte low = buf[offset + 1];
|
||||
if (low >= 0x9F && low <= 0xF1)
|
||||
return low - 0x9F;
|
||||
}
|
||||
|
@ -292,8 +302,9 @@ namespace UniversalDetector.Core
|
|||
charLen = 1;
|
||||
|
||||
// return its order if it is hiragana
|
||||
if (high == HIRAGANA_FIRST_BYTE) {
|
||||
byte low = buf[offset+1];
|
||||
if (high == HIRAGANA_FIRST_BYTE)
|
||||
{
|
||||
byte low = buf[offset + 1];
|
||||
if (low >= 0xA1 && low <= 0xF3)
|
||||
return low - 0xA1;
|
||||
}
|
||||
|
@ -303,8 +314,9 @@ namespace UniversalDetector.Core
|
|||
protected override int GetOrder(byte[] buf, int offset)
|
||||
{
|
||||
// We are only interested in Hiragana
|
||||
if (buf[offset] == HIRAGANA_FIRST_BYTE) {
|
||||
byte low = buf[offset+1];
|
||||
if (buf[offset] == HIRAGANA_FIRST_BYTE)
|
||||
{
|
||||
byte low = buf[offset + 1];
|
||||
if (low >= 0xA1 && low <= 0xF3)
|
||||
return low - 0xA1;
|
||||
}
|
||||
|
|
|
@ -36,7 +36,6 @@
|
|||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
|
@ -135,10 +134,12 @@ namespace UniversalDetector.Core
|
|||
byte[] newbuf = FilterWithEnglishLetters(buf, offset, len);
|
||||
byte charClass, freq;
|
||||
|
||||
for (int i = 0; i < newbuf.Length; i++) {
|
||||
for (int i = 0; i < newbuf.Length; i++)
|
||||
{
|
||||
charClass = Latin1_CharToClass[newbuf[i]];
|
||||
freq = Latin1ClassModel[lastCharClass * CLASS_NUM + charClass];
|
||||
if (freq == 0) {
|
||||
if (freq == 0)
|
||||
{
|
||||
state = ProbingState.NotMe;
|
||||
break;
|
||||
}
|
||||
|
@ -155,13 +156,17 @@ namespace UniversalDetector.Core
|
|||
|
||||
float confidence = 0.0f;
|
||||
int total = 0;
|
||||
for (int i = 0; i < FREQ_CAT_NUM; i++) {
|
||||
for (int i = 0; i < FREQ_CAT_NUM; i++)
|
||||
{
|
||||
total += freqCounter[i];
|
||||
}
|
||||
|
||||
if (total <= 0) {
|
||||
if (total <= 0)
|
||||
{
|
||||
confidence = 0.0f;
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
confidence = freqCounter[3] * 1.0f / total;
|
||||
confidence -= freqCounter[1] * 20.0f / total;
|
||||
}
|
||||
|
|
|
@ -36,7 +36,6 @@
|
|||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
|
@ -67,7 +66,8 @@ namespace UniversalDetector.Core
|
|||
|
||||
public override string GetCharsetName()
|
||||
{
|
||||
if (bestGuess == -1) {
|
||||
if (bestGuess == -1)
|
||||
{
|
||||
GetConfidence();
|
||||
if (bestGuess == -1)
|
||||
bestGuess = 0;
|
||||
|
@ -78,12 +78,16 @@ namespace UniversalDetector.Core
|
|||
public override void Reset()
|
||||
{
|
||||
activeNum = 0;
|
||||
for (int i = 0; i < probers.Length; i++) {
|
||||
if (probers[i] != null) {
|
||||
for (int i = 0; i < probers.Length; i++)
|
||||
{
|
||||
if (probers[i] != null)
|
||||
{
|
||||
probers[i].Reset();
|
||||
isActive[i] = true;
|
||||
++activeNum;
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
isActive[i] = false;
|
||||
}
|
||||
}
|
||||
|
@ -100,13 +104,18 @@ namespace UniversalDetector.Core
|
|||
bool keepNext = true;
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
if ((buf[i] & 0x80) != 0) {
|
||||
for (int i = offset; i < max; i++)
|
||||
{
|
||||
if ((buf[i] & 0x80) != 0)
|
||||
{
|
||||
highbyteBuf[hptr++] = buf[i];
|
||||
keepNext = true;
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
//if previous is highbyte, keep this even it is a ASCII
|
||||
if (keepNext) {
|
||||
if (keepNext)
|
||||
{
|
||||
highbyteBuf[hptr++] = buf[i];
|
||||
keepNext = false;
|
||||
}
|
||||
|
@ -115,18 +124,23 @@ namespace UniversalDetector.Core
|
|||
|
||||
ProbingState st = ProbingState.NotMe;
|
||||
|
||||
for (int i = 0; i < probers.Length; i++) {
|
||||
for (int i = 0; i < probers.Length; i++)
|
||||
{
|
||||
if (!isActive[i])
|
||||
continue;
|
||||
st = probers[i].HandleData(highbyteBuf, 0, hptr);
|
||||
if (st == ProbingState.FoundIt) {
|
||||
if (st == ProbingState.FoundIt)
|
||||
{
|
||||
bestGuess = i;
|
||||
state = ProbingState.FoundIt;
|
||||
break;
|
||||
} else if (st == ProbingState.NotMe) {
|
||||
}
|
||||
else if (st == ProbingState.NotMe)
|
||||
{
|
||||
isActive[i] = false;
|
||||
activeNum--;
|
||||
if (activeNum <= 0) {
|
||||
if (activeNum <= 0)
|
||||
{
|
||||
state = ProbingState.NotMe;
|
||||
break;
|
||||
}
|
||||
|
@ -140,16 +154,23 @@ namespace UniversalDetector.Core
|
|||
float bestConf = 0.0f;
|
||||
float cf = 0.0f;
|
||||
|
||||
if (state == ProbingState.FoundIt) {
|
||||
if (state == ProbingState.FoundIt)
|
||||
{
|
||||
return 0.99f;
|
||||
} else if (state == ProbingState.NotMe) {
|
||||
}
|
||||
else if (state == ProbingState.NotMe)
|
||||
{
|
||||
return 0.01f;
|
||||
} else {
|
||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < PROBERS_NUM; i++)
|
||||
{
|
||||
if (!isActive[i])
|
||||
continue;
|
||||
cf = probers[i].GetConfidence();
|
||||
if (bestConf < cf) {
|
||||
if (bestConf < cf)
|
||||
{
|
||||
bestConf = cf;
|
||||
bestGuess = i;
|
||||
}
|
||||
|
@ -162,10 +183,14 @@ namespace UniversalDetector.Core
|
|||
{
|
||||
float cf;
|
||||
GetConfidence();
|
||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
||||
if (!isActive[i]) {
|
||||
for (int i = 0; i < PROBERS_NUM; i++)
|
||||
{
|
||||
if (!isActive[i])
|
||||
{
|
||||
//Console.WriteLine(" MBCS inactive: {0} (confidence is too low).", ProberName[i]);
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
cf = probers[i].GetConfidence();
|
||||
//Console.WriteLine(" MBCS {0}: [{1}]", cf, ProberName[i]);
|
||||
}
|
||||
|
|
|
@ -174,7 +174,7 @@ namespace UniversalDetector.Core
|
|||
// it is used for frequency analysis only, and we are validating
|
||||
// each code range there as well. So it is safe to set it to be
|
||||
// 2 here.
|
||||
private readonly static int[] GB18030CharLenTable = {0, 1, 1, 1, 1, 1, 2};
|
||||
private readonly static int[] GB18030CharLenTable = { 0, 1, 1, 1, 1, 1, 2 };
|
||||
|
||||
public GB18030SMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
|
@ -235,7 +235,7 @@ namespace UniversalDetector.Core
|
|||
BitPackage.Pack4bits(ERROR,START,START,START,START,START,START,START) //10-17
|
||||
};
|
||||
|
||||
private readonly static int[] BIG5CharLenTable = {0, 1, 1, 2, 0};
|
||||
private readonly static int[] BIG5CharLenTable = { 0, 1, 1, 2, 0 };
|
||||
|
||||
public BIG5SMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
|
|
|
@ -36,7 +36,6 @@
|
|||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
|
@ -88,19 +87,24 @@ namespace UniversalDetector.Core
|
|||
if (newBuf.Length == 0)
|
||||
return state; // Nothing to see here, move on.
|
||||
|
||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
||||
for (int i = 0; i < PROBERS_NUM; i++)
|
||||
{
|
||||
if (!isActive[i])
|
||||
continue;
|
||||
st = probers[i].HandleData(newBuf, 0, newBuf.Length);
|
||||
|
||||
if (st == ProbingState.FoundIt) {
|
||||
if (st == ProbingState.FoundIt)
|
||||
{
|
||||
bestGuess = i;
|
||||
state = ProbingState.FoundIt;
|
||||
break;
|
||||
} else if (st == ProbingState.NotMe) {
|
||||
}
|
||||
else if (st == ProbingState.NotMe)
|
||||
{
|
||||
isActive[i] = false;
|
||||
activeNum--;
|
||||
if (activeNum <= 0) {
|
||||
if (activeNum <= 0)
|
||||
{
|
||||
state = ProbingState.NotMe;
|
||||
break;
|
||||
}
|
||||
|
@ -112,7 +116,8 @@ namespace UniversalDetector.Core
|
|||
public override float GetConfidence()
|
||||
{
|
||||
float bestConf = 0.0f, cf;
|
||||
switch (state) {
|
||||
switch (state)
|
||||
{
|
||||
case ProbingState.FoundIt:
|
||||
return 0.99f; //sure yes
|
||||
case ProbingState.NotMe:
|
||||
|
@ -138,7 +143,8 @@ namespace UniversalDetector.Core
|
|||
{
|
||||
float cf = GetConfidence();
|
||||
// Console.WriteLine(" SBCS Group Prober --------begin status");
|
||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
||||
for (int i = 0; i < PROBERS_NUM; i++)
|
||||
{
|
||||
if (isActive[i])
|
||||
probers[i].DumpStatus();
|
||||
//else
|
||||
|
@ -148,15 +154,19 @@ namespace UniversalDetector.Core
|
|||
//Console.WriteLine(" SBCS Group found best match [{0}] confidence {1}.", probers[bestGuess].GetCharsetName(), cf);
|
||||
}
|
||||
|
||||
public override void Reset ()
|
||||
public override void Reset()
|
||||
{
|
||||
int activeNum = 0;
|
||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
||||
if (probers[i] != null) {
|
||||
for (int i = 0; i < PROBERS_NUM; i++)
|
||||
{
|
||||
if (probers[i] != null)
|
||||
{
|
||||
probers[i].Reset();
|
||||
isActive[i] = true;
|
||||
activeNum++;
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
isActive[i] = false;
|
||||
}
|
||||
}
|
||||
|
@ -167,7 +177,8 @@ namespace UniversalDetector.Core
|
|||
public override string GetCharsetName()
|
||||
{
|
||||
//if we have no answer yet
|
||||
if (bestGuess == -1) {
|
||||
if (bestGuess == -1)
|
||||
{
|
||||
GetConfidence();
|
||||
//no charset seems positive
|
||||
if (bestGuess == -1)
|
||||
|
|
|
@ -36,7 +36,6 @@
|
|||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
|
@ -49,7 +48,7 @@ namespace UniversalDetector.Core
|
|||
private const float NEGATIVE_SHORTCUT_THRESHOLD = 0.05f;
|
||||
private const int SYMBOL_CAT_ORDER = 250;
|
||||
private const int NUMBER_OF_SEQ_CAT = 4;
|
||||
private const int POSITIVE_CAT = NUMBER_OF_SEQ_CAT-1;
|
||||
private const int POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1;
|
||||
private const int NEGATIVE_CAT = 0;
|
||||
|
||||
protected SequenceModel model;
|
||||
|
@ -89,28 +88,33 @@ namespace UniversalDetector.Core
|
|||
{
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
for (int i = offset; i < max; i++)
|
||||
{
|
||||
byte order = model.GetOrder(buf[i]);
|
||||
|
||||
if (order < SYMBOL_CAT_ORDER)
|
||||
totalChar++;
|
||||
|
||||
if (order < SAMPLE_SIZE) {
|
||||
if (order < SAMPLE_SIZE)
|
||||
{
|
||||
freqChar++;
|
||||
|
||||
if (lastOrder < SAMPLE_SIZE) {
|
||||
if (lastOrder < SAMPLE_SIZE)
|
||||
{
|
||||
totalSeqs++;
|
||||
if (!reversed)
|
||||
++(seqCounters[model.GetPrecedence(lastOrder*SAMPLE_SIZE+order)]);
|
||||
++(seqCounters[model.GetPrecedence(lastOrder * SAMPLE_SIZE + order)]);
|
||||
else // reverse the order of the letters in the lookup
|
||||
++(seqCounters[model.GetPrecedence(order*SAMPLE_SIZE+lastOrder)]);
|
||||
++(seqCounters[model.GetPrecedence(order * SAMPLE_SIZE + lastOrder)]);
|
||||
}
|
||||
}
|
||||
lastOrder = order;
|
||||
}
|
||||
|
||||
if (state == ProbingState.Detecting) {
|
||||
if (totalSeqs > SB_ENOUGH_REL_THRESHOLD) {
|
||||
if (state == ProbingState.Detecting)
|
||||
{
|
||||
if (totalSeqs > SB_ENOUGH_REL_THRESHOLD)
|
||||
{
|
||||
float cf = GetConfidence();
|
||||
if (cf > POSITIVE_SHORTCUT_THRESHOLD)
|
||||
state = ProbingState.FoundIt;
|
||||
|
@ -139,7 +143,8 @@ namespace UniversalDetector.Core
|
|||
// POSITIVE_APPROACH
|
||||
float r = 0.0f;
|
||||
|
||||
if (totalSeqs > 0) {
|
||||
if (totalSeqs > 0)
|
||||
{
|
||||
r = 1.0f * seqCounters[POSITIVE_CAT] / totalSeqs / model.TypicalPositiveRatio;
|
||||
r = r * freqChar / totalChar;
|
||||
if (r >= 1.0f)
|
||||
|
|
|
@ -69,29 +69,36 @@ namespace UniversalDetector.Core
|
|||
int codingState;
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
for (int i = offset; i < max; i++)
|
||||
{
|
||||
codingState = codingSM.NextState(buf[i]);
|
||||
if (codingState == SMModel.ERROR) {
|
||||
if (codingState == SMModel.ERROR)
|
||||
{
|
||||
state = ProbingState.NotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == SMModel.ITSME) {
|
||||
if (codingState == SMModel.ITSME)
|
||||
{
|
||||
state = ProbingState.FoundIt;
|
||||
break;
|
||||
}
|
||||
if (codingState == SMModel.START) {
|
||||
if (codingState == SMModel.START)
|
||||
{
|
||||
int charLen = codingSM.CurrentCharLen;
|
||||
if (i == offset) {
|
||||
if (i == offset)
|
||||
{
|
||||
lastChar[1] = buf[offset];
|
||||
contextAnalyser.HandleOneChar(lastChar, 2-charLen, charLen);
|
||||
contextAnalyser.HandleOneChar(lastChar, 2 - charLen, charLen);
|
||||
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||
} else {
|
||||
contextAnalyser.HandleOneChar(buf, i+1-charLen, charLen);
|
||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
||||
}
|
||||
else
|
||||
{
|
||||
contextAnalyser.HandleOneChar(buf, i + 1 - charLen, charLen);
|
||||
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
|
||||
}
|
||||
}
|
||||
}
|
||||
lastChar[0] = buf[max-1];
|
||||
lastChar[0] = buf[max - 1];
|
||||
if (state == ProbingState.Detecting)
|
||||
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
state = ProbingState.FoundIt;
|
||||
|
|
|
@ -36,7 +36,6 @@
|
|||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
|
|
|
@ -36,7 +36,6 @@
|
|||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
|
|
|
@ -51,7 +51,8 @@ namespace UniversalDetector.Core
|
|||
Reset();
|
||||
}
|
||||
|
||||
public override string GetCharsetName() {
|
||||
public override string GetCharsetName()
|
||||
{
|
||||
return "UTF-8";
|
||||
}
|
||||
|
||||
|
@ -67,21 +68,25 @@ namespace UniversalDetector.Core
|
|||
int codingState = SMModel.START;
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
for (int i = offset; i < max; i++)
|
||||
{
|
||||
|
||||
codingState = codingSM.NextState(buf[i]);
|
||||
|
||||
if (codingState == SMModel.ERROR) {
|
||||
if (codingState == SMModel.ERROR)
|
||||
{
|
||||
state = ProbingState.NotMe;
|
||||
break;
|
||||
}
|
||||
|
||||
if (codingState == SMModel.ITSME) {
|
||||
if (codingState == SMModel.ITSME)
|
||||
{
|
||||
state = ProbingState.FoundIt;
|
||||
break;
|
||||
}
|
||||
|
||||
if (codingState == SMModel.START) {
|
||||
if (codingState == SMModel.START)
|
||||
{
|
||||
if (codingSM.CurrentCharLen >= 2)
|
||||
numOfMBChar++;
|
||||
}
|
||||
|
@ -98,11 +103,14 @@ namespace UniversalDetector.Core
|
|||
float unlike = 0.99f;
|
||||
float confidence = 0.0f;
|
||||
|
||||
if (numOfMBChar < 6) {
|
||||
if (numOfMBChar < 6)
|
||||
{
|
||||
for (int i = 0; i < numOfMBChar; i++)
|
||||
unlike *= ONE_CHAR_PROB;
|
||||
confidence = 1.0f - unlike;
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
confidence = 0.99f;
|
||||
}
|
||||
return confidence;
|
||||
|
|
|
@ -39,7 +39,7 @@
|
|||
namespace UniversalDetector.Core
|
||||
{
|
||||
|
||||
enum InputState { PureASCII=0, EscASCII=1, Highbyte=2 };
|
||||
enum InputState { PureASCII = 0, EscASCII = 1, Highbyte = 2 };
|
||||
|
||||
public abstract class UniversalDetector
|
||||
{
|
||||
|
@ -70,7 +70,8 @@ namespace UniversalDetector.Core
|
|||
protected CharsetProber escCharsetProber;
|
||||
protected string detectedCharset;
|
||||
|
||||
public UniversalDetector(int languageFilter) {
|
||||
public UniversalDetector(int languageFilter)
|
||||
{
|
||||
this.start = true;
|
||||
this.inputState = InputState.PureASCII;
|
||||
this.lastChar = 0x00;
|
||||
|
@ -80,7 +81,8 @@ namespace UniversalDetector.Core
|
|||
|
||||
public virtual void Feed(byte[] buf, int offset, int len)
|
||||
{
|
||||
if (done) {
|
||||
if (done)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -88,10 +90,13 @@ namespace UniversalDetector.Core
|
|||
gotData = true;
|
||||
|
||||
// If the data starts with BOM, we know it is UTF
|
||||
if (start) {
|
||||
if (start)
|
||||
{
|
||||
start = false;
|
||||
if (len > 3) {
|
||||
switch (buf[0]) {
|
||||
if (len > 3)
|
||||
{
|
||||
switch (buf[0])
|
||||
{
|
||||
case 0xEF:
|
||||
if (0xBB == buf[1] && 0xBF == buf[2])
|
||||
detectedCharset = "UTF-8";
|
||||
|
@ -118,22 +123,27 @@ namespace UniversalDetector.Core
|
|||
break;
|
||||
} // switch
|
||||
}
|
||||
if (detectedCharset != null) {
|
||||
if (detectedCharset != null)
|
||||
{
|
||||
done = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
for (int i = 0; i < len; i++)
|
||||
{
|
||||
|
||||
// other than 0xa0, if every other character is ascii, the page is ascii
|
||||
if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0) {
|
||||
if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
|
||||
{
|
||||
// we got a non-ascii byte (high-byte)
|
||||
if (inputState != InputState.Highbyte) {
|
||||
if (inputState != InputState.Highbyte)
|
||||
{
|
||||
inputState = InputState.Highbyte;
|
||||
|
||||
// kill EscCharsetProber if it is active
|
||||
if (escCharsetProber != null) {
|
||||
if (escCharsetProber != null)
|
||||
{
|
||||
escCharsetProber = null;
|
||||
}
|
||||
|
||||
|
@ -145,9 +155,12 @@ namespace UniversalDetector.Core
|
|||
if (charsetProbers[2] == null)
|
||||
charsetProbers[2] = new Latin1Prober();
|
||||
}
|
||||
} else {
|
||||
}
|
||||
else
|
||||
{
|
||||
if (inputState == InputState.PureASCII &&
|
||||
(buf[i] == 0x33 || (buf[i] == 0x7B && lastChar == 0x7E))) {
|
||||
(buf[i] == 0x33 || (buf[i] == 0x7B && lastChar == 0x7E)))
|
||||
{
|
||||
// found escape character or HZ "~{"
|
||||
inputState = InputState.EscASCII;
|
||||
}
|
||||
|
@ -157,25 +170,31 @@ namespace UniversalDetector.Core
|
|||
|
||||
ProbingState st = ProbingState.NotMe;
|
||||
|
||||
switch (inputState) {
|
||||
switch (inputState)
|
||||
{
|
||||
case InputState.EscASCII:
|
||||
if (escCharsetProber == null) {
|
||||
if (escCharsetProber == null)
|
||||
{
|
||||
escCharsetProber = new EscCharsetProber();
|
||||
}
|
||||
st = escCharsetProber.HandleData(buf, offset, len);
|
||||
if (st == ProbingState.FoundIt) {
|
||||
if (st == ProbingState.FoundIt)
|
||||
{
|
||||
done = true;
|
||||
detectedCharset = escCharsetProber.GetCharsetName();
|
||||
}
|
||||
break;
|
||||
case InputState.Highbyte:
|
||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
||||
if (charsetProbers[i] != null) {
|
||||
for (int i = 0; i < PROBERS_NUM; i++)
|
||||
{
|
||||
if (charsetProbers[i] != null)
|
||||
{
|
||||
st = charsetProbers[i].HandleData(buf, offset, len);
|
||||
#if DEBUG
|
||||
#if DEBUG
|
||||
charsetProbers[i].DumpStatus();
|
||||
#endif
|
||||
if (st == ProbingState.FoundIt) {
|
||||
#endif
|
||||
if (st == ProbingState.FoundIt)
|
||||
{
|
||||
done = true;
|
||||
detectedCharset = charsetProbers[i].GetCharsetName();
|
||||
return;
|
||||
|
@ -195,38 +214,47 @@ namespace UniversalDetector.Core
|
|||
/// </summary>
|
||||
public virtual void DataEnd()
|
||||
{
|
||||
if (!gotData) {
|
||||
if (!gotData)
|
||||
{
|
||||
// we haven't got any data yet, return immediately
|
||||
// caller program sometimes call DataEnd before anything has
|
||||
// been sent to detector
|
||||
return;
|
||||
}
|
||||
|
||||
if (detectedCharset != null) {
|
||||
if (detectedCharset != null)
|
||||
{
|
||||
done = true;
|
||||
Report(detectedCharset, 1.0f);
|
||||
return;
|
||||
}
|
||||
|
||||
if (inputState == InputState.Highbyte) {
|
||||
if (inputState == InputState.Highbyte)
|
||||
{
|
||||
float proberConfidence = 0.0f;
|
||||
float maxProberConfidence = 0.0f;
|
||||
int maxProber = 0;
|
||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
||||
if (charsetProbers[i] != null) {
|
||||
for (int i = 0; i < PROBERS_NUM; i++)
|
||||
{
|
||||
if (charsetProbers[i] != null)
|
||||
{
|
||||
proberConfidence = charsetProbers[i].GetConfidence();
|
||||
if (proberConfidence > maxProberConfidence) {
|
||||
if (proberConfidence > maxProberConfidence)
|
||||
{
|
||||
maxProberConfidence = proberConfidence;
|
||||
maxProber = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (maxProberConfidence > MINIMUM_THRESHOLD) {
|
||||
if (maxProberConfidence > MINIMUM_THRESHOLD)
|
||||
{
|
||||
Report(charsetProbers[maxProber].GetCharsetName(), maxProberConfidence);
|
||||
}
|
||||
|
||||
} else if (inputState == InputState.PureASCII) {
|
||||
}
|
||||
else if (inputState == InputState.PureASCII)
|
||||
{
|
||||
Report("ASCII", 1.0f);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue