class CharDet::JapaneseContextAnalysis
Public Class Methods
new()
click to toggle source
# File lib/rchardet/jpcntx.rb, line 123
# Builds a new analyser. All instance state is set up by #reset, so a freshly
# constructed object and a reset one start from the same values.
def initialize
  reset
end
Public Instance Methods
feed(aBuf, aLen)
click to toggle source
# File lib/rchardet/jpcntx.rb, line 135
# Consumes the first aLen bytes of aBuf and updates the character-pair
# statistics kept in @relSample.
#
# aBuf - byte-oriented buffer; two-element slices of it are passed to get_order.
# aLen - number of bytes of aBuf to consider.
#
# Does nothing once @done has been set.
def feed(aBuf, aLen)
  return if @done

  # The buffer is byte oriented, so a character may straddle two buffers.
  # When the trailing character of a buffer is incomplete we remember how many
  # bytes are still missing and start past them here. Buffering the partial
  # character and analysing it later would also work, but a single character
  # barely matters, and skipping it simplifies the logic and is faster.
  pos = @needToSkipCharNum
  while pos < aLen
    order, char_len = get_order(aBuf[pos, 2])
    pos += char_len

    if pos > aLen
      # The character runs past the end of this buffer: record the overshoot
      # and forget the previous character so no pair is formed across the gap.
      @needToSkipCharNum = pos - aLen
      @lastCharOrder = -1
      next
    end

    # A pair is only counted when both neighbours have a known order.
    if order != -1 && @lastCharOrder != -1
      @totalRel += 1
      if @totalRel > MAX_REL_THRESHOLD
        # Enough pairs have been seen; stop analysing for good.
        @done = true
        break
      end
      @relSample[JP2_CHAR_CONTEXT[@lastCharOrder][order]] += 1
    end
    @lastCharOrder = order
  end
end
get_confidence()
click to toggle source
# File lib/rchardet/jpcntx.rb, line 169
# Returns the confidence derived from the pairs counted so far.
#
# When more than MINIMUM_DATA_THRESHOLD pairs have been counted, the result is
# the fraction of pairs that did not fall into category 0, as a Float between
# 0.0 and 1.0. Otherwise DONT_KNOW is returned.
#
# This is just one way to calculate confidence; it is a heuristic.
def get_confidence
  return DONT_KNOW unless @totalRel > MINIMUM_DATA_THRESHOLD

  # Both counters are Integers, so the numerator is converted to Float first;
  # plain Integer division would truncate the ratio to 0 or 1.
  (@totalRel - @relSample[0]).to_f / @totalRel
end
get_order(aStr)
click to toggle source
# File lib/rchardet/jpcntx.rb, line 178
# Default character lookup: ignores aStr and always answers with an order of
# -1 and a character length of 1 byte, as a two-element array [order, length].
# NOTE(review): presumably encoding-specific subclasses override this; confirm
# against the rest of the file.
def get_order(aStr)
  [-1, 1]
end
got_enough_data()
click to toggle source
# File lib/rchardet/jpcntx.rb, line 165
# Returns true once the number of counted pairs (@totalRel) is strictly
# greater than ENOUGH_REL_THRESHOLD, false otherwise.
def got_enough_data
  @totalRel > ENOUGH_REL_THRESHOLD
end
reset()
click to toggle source
# File lib/rchardet/jpcntx.rb, line 127
# Restores the analyser to its initial state so it can be fed new data.
def reset
  # Total number of character pairs received so far.
  @totalRel = 0
  # One integer counter per category; each counts the pairs seen in it.
  @relSample = Array.new(NUM_OF_CATEGORY, 0)
  # If the last character of the current buffer is incomplete, this holds how
  # many bytes to skip at the start of the next buffer.
  @needToSkipCharNum = 0
  # Order of the previous character; -1 means there is none.
  @lastCharOrder = -1
  # Set to true once detection is finished and a conclusion has been reached.
  @done = false
end