From bfc3815de6de982f6784016156a0de3f9ee68dcb Mon Sep 17 00:00:00 2001
From: Yoshihiro Takahashi
Date: Wed, 5 Mar 2025 21:42:50 +0100
Subject: [PATCH] mail/bsfilter: Update to 1.0.20

- Remove the local patch because it was obtained from the old OSDN
  repository and the new version includes all the changes
- Change MASTER_SITES to Github
- Update WWW

Changelog: https://github.com/nbkenichi/bsfilter/compare/v1.0.19...v1.0.20
---
 mail/bsfilter/Makefile                      |   17 +-
 mail/bsfilter/distinfo                      |    5 +-
 mail/bsfilter/files/patch-bsfilter_bsfilter | 5075 -------------------
 3 files changed, 13 insertions(+), 5084 deletions(-)
 delete mode 100644 mail/bsfilter/files/patch-bsfilter_bsfilter

diff --git a/mail/bsfilter/Makefile b/mail/bsfilter/Makefile
index ea29b285cc29..9b874ffdb1c4 100644
--- a/mail/bsfilter/Makefile
+++ b/mail/bsfilter/Makefile
@@ -1,26 +1,29 @@
 PORTNAME=	bsfilter
-PORTVERSION=	1.0.19
-PORTREVISION=	4
+DISTVERSIONPREFIX=	v
+DISTVERSION=	1.0.20
 CATEGORIES=	mail ruby
-MASTER_SITES=	OSDN/bsfilter
 
 MAINTAINER=	nyan@FreeBSD.org
 COMMENT=	Bayesian spam filter written in Ruby
-WWW=		https://osdn.net/projects/bsfilter/
+WWW=		https://github.com/nbkenichi/bsfilter
 
 LICENSE=	GPLv2
 
 RUN_DEPENDS=	rubygem-gdbm>=2.0.0,2:databases/rubygem-gdbm \
 		rubygem-sdbm>=1.0.0:databases/rubygem-sdbm
 
-USES=		ruby shebangfix tar:tgz
-SHEBANG_FILES=	bsfilter/bsfilter
+USES=		ruby shebangfix
+SHEBANG_FILES=	src/bsfilter.rb
+
+USE_GITHUB=	yes
+GH_ACCOUNT=	nbkenichi
+
 NO_BUILD=	yes
 
 OPTIONS_DEFINE=	EXAMPLES
 
 do-install:
-	${INSTALL_SCRIPT} ${WRKSRC}/bsfilter/${PORTNAME} ${STAGEDIR}${PREFIX}/bin/${PORTNAME}
+	${INSTALL_SCRIPT} ${WRKSRC}/src/bsfilter.rb ${STAGEDIR}${PREFIX}/bin/${PORTNAME}
 	@${MKDIR} ${STAGEDIR}${EXAMPLESDIR}
 .for FILE in bsfilter.conf.sample dot-qmail.sample
 	${INSTALL_DATA} ${FILESDIR}/${FILE} ${STAGEDIR}${EXAMPLESDIR}
diff --git a/mail/bsfilter/distinfo b/mail/bsfilter/distinfo
index 34c3fcd2b09a..8d4020051465 100644
--- a/mail/bsfilter/distinfo
+++ b/mail/bsfilter/distinfo
@@ -1,2 +1,3 @@
-SHA256 (bsfilter-1.0.19.tgz) = 8aa1d713cc848b20d678eb7a5f24bec1879860d023701644bfd426a587998ac9
-SIZE (bsfilter-1.0.19.tgz) = 78660
+TIMESTAMP = 1740387732
+SHA256 (nbkenichi-bsfilter-v1.0.20_GH0.tar.gz) = 10fb704f3528f3a81ed350c42d0980d9104ed11b366d85a5795fbbc6e04b91db
+SIZE (nbkenichi-bsfilter-v1.0.20_GH0.tar.gz) = 78005
diff --git a/mail/bsfilter/files/patch-bsfilter_bsfilter b/mail/bsfilter/files/patch-bsfilter_bsfilter
deleted file mode 100644
index ff5d3d2ca636..000000000000
--- a/mail/bsfilter/files/patch-bsfilter_bsfilter
+++ /dev/null
@@ -1,5075 +0,0 @@
---- bsfilter/bsfilter.orig	2013-11-03 10:22:15 UTC
-+++ bsfilter/bsfilter
-@@ -1,6 +1,6 @@
- #!
/usr/bin/env ruby --## -*-Ruby-*- $Id: bsfilter,v 1.87 2013/11/03 10:22:15 nabeken Exp $ --## Copyright (C) 2003, 2004, 2005, 2006 NABEYA Kenichi -+## -*-Ruby-*- $Id: bsfilter,v 1.89 2023/12/26 05:52:39 nabeken Exp $ -+## Copyright (C) 2003-2023 NABEYA Kenichi - ## - ## This program is free software; you can redistribute it and/or modify - ## it under the terms of the GNU General Public License as published by -@@ -16,115 +16,112 @@ - ## along with this program; if not, write to the Free Software - ## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -+require 'English' - require 'getoptlong' - require 'nkf' - - class Bsfilter - def initialize -- @threads = Array::new -+ @threads = [] - @token_dbs = nil -- @options = Hash::new -- @db_hash = Hash::new -+ @options = {} -+ @db_hash = {} - @jtokenizer = nil - end - attr_accessor :token_dbs - -- Release = "$Name: release_1_0_19 $".split[1].sub(/\A[^\d]*/, '').gsub(/_/, '.') -- Release.concat("-") if (Release == "") -- Revision = "$Revision: 1.87 $".gsub(/[^\.\d]/, '') -- Languages = ["C", "ja"] -- Default_Language = "C" -+ Release = '$Name: $'.split[1].sub(/\A[^\d]*/, '').gsub(/_/, '.') -+ Release.concat('-') if (Release == '') -+ Revision = '$Revision: 1.89 $'.gsub(/[^.\d]/, '') -+ Languages = %w[C ja].freeze -+ Default_Language = 'C'.freeze - --## Options = Hash::new # used like a global variable --## DB = Hash::new -- -- Default_header_prefix = "Spam" -- Default_spam_subject_prefix = "[SPAM] " -- Default_refer_header = -- ["Ufrom", "From", "To", "Cc", "Subject", "Reply-to", "Return-path", "Received", -- "Content-Transfer-Encoding", "Content-Type", "charset", "Content-Disposition"].join(",") -- -- Default_jtokenizer = "bigram" -- Default_mark_in_token = "|!*'" -- Default_homedir = ".bsfilter" -- Default_conf_file = "bsfilter.conf" -- Default_pid_file = "bsfilter.pid" -- -- Default_method = "rf" # Robinson Fisher -- Default_db = "sdbm" -- Default_max_mail = 10000 -+ ## Options = Hash::new # used like a global variable -+ ## DB = Hash::new -+ -+ Default_header_prefix = 'Spam'.freeze -+ Default_spam_subject_prefix = '[SPAM] '.freeze -+ Default_refer_header = -+ %w[Ufrom From To Cc Subject Reply-to Return-path Received -+ Content-Transfer-Encoding Content-Type charset Content-Disposition].join(',') -+ -+ Default_jtokenizer = 'bigram'.freeze -+ Default_mark_in_token = "|!*'".freeze -+ Default_homedir = '.bsfilter'.freeze -+ Default_conf_file = 'bsfilter.conf'.freeze -+ Default_pid_file = 'bsfilter.pid'.freeze -+ -+ Default_method = 'rf'.freeze # Robinson Fisher -+ Default_db = 'sdbm'.freeze -+ Default_max_mail = 10_000 - Default_min_mail = 8000 - Default_max_line = 500 -- -- Default_pop_proxy_if = "0.0.0.0" -- Default_pop_port = "110" -- Default_pop_proxy_port = "10110" -- Default_pop_max_size = 50000 -- -- Default_imap_port = "143" -- Default_imap_auth = "auto" -- Default_imap_auth_preference = ["cram-md5", "login", "loginc"] - -- Default_icon_number = 32512 -- -- Clean_ext = ".clean" -- Spam_ext = ".spam" -- Prob_ext = ".prob" -- Lock_ext = ".lock" -- -- SDBM_ext = ".sdbm" -- GDBM_ext = ".gdbm" -- BDB1_ext = ".bdb1" -- BDB_ext = ".bdb" -- QDBM_ext = ".qdbm" -- -+ Default_pop_proxy_if = '0.0.0.0'.freeze -+ Default_pop_port = '110'.freeze -+ Default_pop_proxy_port = '10110'.freeze -+ Default_pop_max_size = 50_000 -+ -+ Default_imap_port = '143'.freeze -+ Default_imap_auth = 'auto'.freeze -+ Default_imap_auth_preference = %w[cram-md5 login loginc].freeze -+ -+ Default_icon_number = 32_512 -+ -+ Clean_ext = '.clean'.freeze -+ 
Spam_ext = '.spam'.freeze -+ Prob_ext = '.prob'.freeze -+ Lock_ext = '.lock'.freeze -+ -+ NDBM_ext = '.ndbm'.freeze -+ SDBM_ext = '.sdbm'.freeze -+ GDBM_ext = '.gdbm'.freeze -+ BDB1_ext = '.bdb1'.freeze -+ BDB_ext = '.bdb'.freeze -+ QDBM_ext = '.qdbm'.freeze -+ - EXIT_NORMAL = 0 - CODE_NORMAL = true - CODE_SPAM = true - CODE_CLEAN = false -- -- CODESET_EUCJP = "eucJP" -- CODESET_LATIN = "ISO8859-1" -- CODESET_GB18030 = "GB18030" -- CODESET_UTF8 = "UTF-8" -- PATTERN_UTF8 = '[\xe0-\xef][\x80-\xbf][\x80-\xbf][\xe0-\xef][\x80-\xbf][\x80-\xbf]' -- RE_UTF8 = Regexp.new(PATTERN_UTF8, nil, 'n') -- -- ALL_TAGS = ["html", "head", "title", "meta", "body", "div", "spam", -- "h1", "h2", "h3", "h4", "h5", "h6", -- "em", "strong", "font", "basefont", "big", "small", -- "b", "i", "s", "u", "tt", "sub", "sub", -- "rb", "rp", "rt","ruby", -- "blink", "marquee", -- "dfn", "cite", "abbr", "acronym", -- "blockquote", "q", -- "br", "pre", "ins", "del", "center", "style", "hr", -- "ul", "ol", "li", "dl", "dt", "dd", -- "table", "caption", "thead", "tbody", "tfoot", -- "colgroup", "col", "tr", "td", "th", -- "a", "link", "base", "img", "address", -- "form", "input", "select", "option", "textarea", "label", -- "fieldset", "legend", "optgroup", -- "frameset", "frame", "nofrmaes", "iframe"].join('|') -- -- SPACE_TAGS = "br|p|td|tr|table|ul|ol|dl|li|dt|dd" -- -- RE_ALL_TAGS = Regexp::compile('\A<(' + ALL_TAGS + ')\b', Regexp::IGNORECASE, 'n') -- RE_SPACE_TAGS = Regexp::compile('\A<(' + SPACE_TAGS + ')\b', Regexp::IGNORECASE, 'n') -- -- SOCKET_TIMEOUT = 30 # for single socket operation -- -+ -+ LOG_CODESET = 'UTF-8'.freeze # codeset for verbose and debug message. nil => no conversion -+ -+ ALL_TAGS = %w[html head title meta body div spam -+ h1 h2 h3 h4 h5 h6 -+ em strong font basefont big small -+ b i s u tt sub sub -+ rb rp rt ruby -+ blink marquee -+ dfn cite abbr acronym -+ blockquote q -+ br pre ins del center style hr -+ ul ol li dl dt dd -+ table caption thead tbody tfoot -+ colgroup col tr td th -+ a link base img address -+ form input select option textarea label -+ fieldset legend optgroup -+ frameset frame nofrmaes iframe].join('|') -+ -+ SPACE_TAGS = 'br|p|td|tr|table|ul|ol|dl|li|dt|dd'.freeze -+ -+ RE_ALL_TAGS = Regexp.compile('\A<(' + ALL_TAGS + ')\b', Regexp::IGNORECASE) -+ RE_SPACE_TAGS = Regexp.compile('\A<(' + SPACE_TAGS + ')\b', Regexp::IGNORECASE) -+ -+ SOCKET_TIMEOUT = 30 # for single socket operation -+ - module Bsutil - def insert_header!(buf, header, content) - buf[0] =~ /([\r\n]*)\z/ -- eol = $1 -- -- (0 ... buf.length).each do |i| -- if ((i == 0) && # unix from line -+ eol = ::Regexp.last_match(1) -+ -+ (0...buf.length).each do |i| -+ if (i.zero? && # unix from line - (buf[i] =~ /\A>?from\s+(\S+)/)) - next -- elsif (buf[i] =~/\A(.*?:)/) -- h = $1 -+ elsif (buf[i] =~ /\A(.*?:)/) -+ h = ::Regexp.last_match(1) - if (h == header) - buf[i] = "#{header} #{content}#{eol}" - return -@@ -134,7 +131,7 @@ class Bsfilter - elsif (buf[i] =~ /\A[\r\n]*\z/) # separator between header and body - buf[i, 0] = "#{header} #{content}#{eol}" - return -- else # not header. may be body without separator -+ else # not header. may be body without separator - buf[i, 0] = "#{header} #{content}#{eol}" - return - end -@@ -144,17 +141,17 @@ class Bsfilter - - def append_header!(buf, header, prefix) - buf[0] =~ /([\r\n]*)\z/ -- eol = $1 -+ eol = ::Regexp.last_match(1) - append_done = false -- (0 ... 
buf.length).each do |i| -- if (buf[i] =~/\A(.*?:)(\s*)(.*?)([\r\n]*)\z/) -- h = $1 -- org_content = $3 -+ (0...buf.length).each do |i| -+ if (buf[i] =~ /\A(.*?:)(\s*)(.*?)([\r\n]*)\z/) -+ h = ::Regexp.last_match(1) -+ org_content = ::Regexp.last_match(3) - if (h.downcase == header.downcase) - buf[i] = "#{header} #{prefix}#{org_content}#{eol}" - append_done = true - end -- elsif ((! append_done) && -+ elsif (!append_done && - (((buf[i] =~ /\A\S/) && (buf[i] !~ /\A\S+:/)) || # found body without separator - (buf[i] =~ /\A[\r\n]*\z/))) # separator between header and body - buf[i, 0] = "#{header} #{prefix}#{eol}" -@@ -166,104 +163,96 @@ class Bsfilter - end - - def x_spam_flag -- return sprintf("X-%s-Flag:", @options["header-prefix"]) -+ return format('X-%s-Flag:', @options['header-prefix']) - end -- -+ - def x_spam_probability -- return sprintf("X-%s-Probability:", @options["header-prefix"]) -+ return format('X-%s-Probability:', @options['header-prefix']) - end -- -+ - def x_spam_revision -- return sprintf("X-%s-Revision:", @options["header-prefix"]) -+ return format('X-%s-Revision:', @options['header-prefix']) - end -- -- def insert_headers!(buf, spam_flag, probability=nil) -+ -+ def insert_headers!(buf, spam_flag, probability = nil) - updated = false -- if (@options["insert-revision"]) -+ if (@options['insert-revision']) - insert_header!(buf, x_spam_revision, "bsfilter release #{Release} revision #{Revision}") - updated = true - end -- if (@options["insert-flag"]) -+ if (@options['insert-flag']) - updated = true -- if (spam_flag) -- insert_header!(buf, x_spam_flag, "Yes") -+ if spam_flag -+ insert_header!(buf, x_spam_flag, 'Yes') - else -- insert_header!(buf, x_spam_flag, "No") -+ insert_header!(buf, x_spam_flag, 'No') - end - end -- if (@options["insert-probability"] && probability) -+ if (@options['insert-probability'] && probability) - updated = true -- insert_header!(buf, x_spam_probability, sprintf("%f", probability)) -+ insert_header!(buf, x_spam_probability, format('%f', probability)) - end -- if (@options["mark-spam-subject"]) -+ if (@options['mark-spam-subject']) - updated = true -- if (spam_flag) -- append_header!(buf, "Subject:", @options["spam-subject-prefix"]) -- end -+ append_header!(buf, 'Subject:', @options['spam-subject-prefix']) if spam_flag - end - return updated - end -- end # end of module -+ end - - include Bsutil - - class DevNull -- def sync=(*args) -- end -- def print(*args) -- end -- def printf(*args) -- end -+ def sync=(*args); end -+ -+ def print(*args); end -+ -+ def printf(*args); end - end - - class DBHash < Hash -- def flatten(magic="###", head="", &block) -- self.each do |k, v| -- if (v.class == DBHash) -- if (head == "") -+ def flatten(magic = '###', head = '', &block) -+ each do |k, v| -+ if v.instance_of?(DBHash) -+ if (head == '') - v.flatten(magic, k, &block) - else - v.flatten(magic, head + magic + k, &block) - end -+ elsif (head == '') -+ yield k, v - else -- if (head == "") -- yield k, v -- else -- yield head + magic + k, v -- end -+ yield head + magic + k, v - end - end - end -- -+ - def add(hash) - hash.each do |k, v| - if (self[k]) -- if ((self[k].class == DBHash) && -- (v.class == DBHash)) -+ if (self[k].instance_of?(DBHash) && -+ v.instance_of?(DBHash)) - self[k].add(v) - else - self[k] += v - end - else -- self[k] = v # should do deep copy ? -+ self[k] = v # should do deep copy ? 
- end - end - end -+ - def sub(hash) - hash.each do |k, v| - if (self[k]) -- if ((self[k].class == DBHash) && -- (v.class == DBHash)) -+ if (self[k].instance_of?(DBHash) && -+ v.instance_of?(DBHash)) - self[k].sub(v) -- if (self[k].empty?) -- self.delete(k) -- end -+ delete(k) if self[k].empty? -+ elsif (self[k] > v) -+ self[k] -= v - else -- if (self[k] > v) -- self[k] -= v -- else -- self.delete(k) -- end -+ delete(k) - end - end - end -@@ -271,38 +260,38 @@ class Bsfilter - end - - def safe_require(file) -- begin -- require file -- return true -- rescue LoadError -- return false -- end -+ require file -+ return true -+ rescue LoadError -+ return false - end - - def latin2ascii(str) - str.force_encoding('ASCII-8BIT') - newstr = str.tr("\x92\x93\x94".force_encoding('ASCII-8BIT'), "'''") -- newstr.tr!("\xc0-\xc5\xc8-\xcb\xcc-\xcf\xd2-\xd6\xd9-\xdc".force_encoding('ASCII-8BIT'), "AAAAAAEEEEIIIIOOOOOUUUU") -- newstr.tr!("\xe0-\xe5\xe8-\xeb\xec-\xef\xf2-\xf6\xf9-\xfc".force_encoding('ASCII-8BIT'), "aaaaaaeeeeiiiiooooouuuu") -+ newstr.tr!("\xc0-\xc5\xc8-\xcb\xcc-\xcf\xd2-\xd6\xd9-\xdc".force_encoding('ASCII-8BIT'), 'AAAAAAEEEEIIIIOOOOOUUUU') -+ newstr.tr!("\xe0-\xe5\xe8-\xeb\xec-\xef\xf2-\xf6\xf9-\xfc".force_encoding('ASCII-8BIT'), 'aaaaaaeeeeiiiiooooouuuu') - return newstr - end - - def u2eucjp(str) -- return NKF::nkf('-e -E -X -Z0', str.encode('EUC-JP', 'UTF-8', :undef => :replace, :invalid => :replace)) -+ return NKF.nkf('-e -E -X -Z0', str.encode('EUC-JP', 'UTF-8', undef: :replace, invalid: :replace)) - end -+ - def u2latin(str) -- return str.encode('US-ASCII', 'UTF-8', :undef => :replace, :invalid => :replace) -+ return str.encode('US-ASCII', 'UTF-8', undef: :replace, invalid: :replace) - end -+ - def gb180302eucjp(str) -- return str.encode('EUC-JP', 'BIG5', :undef => :replace, :invalid => :replace) -+ return str.encode('EUC-JP', 'BIG5', undef: :replace, invalid: :replace) - end -- -+ - def open_ro(file) -- if (file == "-") -- fh = STDIN -+ if (file == '-') -+ fh = $stdin - yield fh -- elsif (file.class == Array) -- file.instance_eval < 0) -+ -+ def set_f(a, power = 1) -+ if a.positive? - @mant = 1 -- @exp = Math::log(a) * power -- elsif (a < 0) -+ @exp = Math.log(a) * power -+ elsif a.negative? - @mant = -1 -- @exp = Math::log(-a) * power -+ @exp = Math.log(-a) * power - else - @mant = 0 - @exp = 0 -@@ -390,24 +380,24 @@ EOM - self - end - end -- -- -+ - module TokenAccess - def check_size(max_size, min_size) - if ((@file_count <= max_size) || (max_size <= 0) || (min_size <= 0)) - return false - end -+ - old_count = @file_count -- if (@options["verbose"]) -- @options["message-fh"].printf("reduce token database %s from %d to %d\n", @filename, old_count, min_size) -+ if (@options['verbose']) -+ @options['message-fh'].printf("reduce token database %s from %d to %d\n", @filename, old_count, min_size) - end -- -+ - key_cts.each do |(category, token)| -- if (category != ".internal") -+ if (category != '.internal') - v = value(category, token) || 0 - sub_scalar(category, token, (v * (old_count - min_size).to_f / old_count.to_f).ceil) -- if (@options["debug"] && ! value(category, token)) -- @options["message-fh"].printf("deleted %s %s\n", category, token) -+ if (@options['debug'] && ! 
value(category, token)) -+ @options['message-fh'].printf("deleted %s %s\n", category, token.to_utf8) - end - end - end -@@ -415,41 +405,47 @@ EOM - @dirty = true - return true - end -- -+ - def value_with_degene(category, token) -- if (value(category, token)) -+ if value(category, token) - return value(category, token) -- elsif (! @options["degeneration"]) # no degeneration -+ elsif (!@options['degeneration']) # no degeneration - return nil - else -- if (v = value(category, token[0 .. -2])) # cut last char -- return v -+ if (v = value(category, token[0..-2])) # cut last char -+ return v - end -- token = token.gsub(Regexp::compile("[#{@options['mark-in-token']}]"), '') -+ -+ token = token.gsub(Regexp.compile("[#{@options['mark-in-token']}]"), '') - if (v = value(category, token)) -- return v -+ return v - end -+ - token = token.downcase - if (v = value(category, token)) -- return v -+ return v - end -+ - token = token.upcase - if (v = value(category, token)) -- return v -+ return v - end -+ - token = token.capitalize - if (v = value(category, token)) -- return v -+ return v - end -+ - return nil - end - end -+ - def set_scalar(category, token, val) - @dirty = true - @file_count += 1 - set(category, token, val) - end -- -+ - def add_scalar(category, token, val) - @dirty = true - @file_count += 1 -@@ -459,58 +455,58 @@ EOM - set(category, token, val) - end - end -- -+ - def show_new_token(db) - db.each_ct do |category, token| -- if (! value(category, token) || (value(category, token) == 0)) -- @options["message-fh"].printf("new %s %s\n", category, token) -+ if (!value(category, token) || value(category, token).zero?) -+ @options['message-fh'].printf("new %s %s\n", category, token.to_utf8) - end - end - end -- -+ - def values -- array = Array::new -+ array = [] - each_ct do |c, t| - array.push(value(c, t)) - end - return array - end -- -+ - def key_cts -- array = Array::new -+ array = [] - each_ct do |c, t| - array.push([c, t]) - end - return array - end -- -+ - def export(fh) - each_ct do |category, token| -- fh.printf("%s %s %s %g\n", @language, category, token, value(category, token)) if (value(category, token)) -+ fh.printf("%s %s %s %g\n", @language, category, token, value(category, token)) if value(category, token) - end - end - end -- -+ - class TokenDB - include TokenAccess -- -- def initialize(language=nil) -- @hash = DBHash::new -+ -+ def initialize(language = nil) -+ @hash = DBHash.new - @file_count = 0 - @language = language -- @message_id = "-" -+ @message_id = '-' - @probability = nil - @spam_flag = nil - @dirty = false - @time = nil -- @filename = "-" -+ @filename = '-' - end - attr_accessor :hash, :file_count, :probability, :language, :spam_flag, :message_id, :time, :filename -- -+ - def size - @hash.size - end -- -+ - def each_ct - @hash.each_key do |category| - @hash[category].each_key do |token| -@@ -518,9 +514,9 @@ EOM - end - end - end -- -+ - def value(category, token) -- if (! @hash[category]) -+ if (!@hash[category]) - return nil - elsif (v = @hash[category][token]) - return v -@@ -528,14 +524,14 @@ EOM - return nil - end - end -- -+ - def set(category, token, v) - @dirty = true -- @hash[category] = DBHash::new if (! @hash[category]) -+ @hash[category] = DBHash.new if (! 
@hash[category]) - @hash[category][token] = v - end -- -- def print_keys_to_str(hash, separator, fh=STDOUT) -+ -+ def print_keys_to_str(hash, separator, fh = $stdout) - hash.keys.sort.each do |k| - v = hash[k] - v = v.to_i -@@ -543,57 +539,49 @@ EOM - fh.print(([k] * v).join(separator)) - end - end -- -+ - def clear - @dirty = true - @file_count = 0 -- @hash = DBHash::new -+ @hash = DBHash.new - end -- -+ - def add_db(db) - @dirty = true - @file_count += db.file_count -- if (! @language && db.language) -- @language = db.language -- end -+ @language = db.language if (!@language && db.language) - @hash.add(db.hash) - end -- -+ - def add_hash(hash) - @dirty = true - @file_count += 1 - @hash.add(hash) - end -- -+ - def sub_scalar(category, token, val) -- if (@file_count > 0) -- @file_count -= 1 -- end -- @hash.sub({category => {token => val}}) -+ @file_count -= 1 if @file_count.positive? -+ @hash.sub({ category => { token => val } }) - end -- -+ - def sub_hash(hash) - @dirty = true -- if (@file_count > 0) -- @file_count -= 1 -- end -+ @file_count -= 1 if @file_count.positive? - @hash.sub(hash) - end -- -+ - def sub_db(db) - @dirty = true - @file_count -= db.file_count -- if (@file_count < 1) -- @file_count = 1 -- end -+ @file_count = 1 if (@file_count < 1) - @hash.sub(db.hash) - end - end -- -+ - class TokenDBM - include TokenAccess -- MAGIC = "###" -- def initialize(options, language, ext) -+ MAGIC = '###'.freeze -+ def initialize(options, language, _ext) - @options = options - @dbm = nil # SDBM not Hash - @dirty = nil # not used. for TokenAccess -@@ -602,13 +590,13 @@ EOM - @language = language - end - attr_accessor :file_count -- -+ - def size - @dbm.size - end -- -+ - def to_db -- token_db = TokenDB::new(@language) -+ token_db = TokenDB.new(@language) - @dbm.each do |ct, v| - (category, token) = ct.split(Regexp.new(MAGIC), 2) - token_db.set(category, token, v) -@@ -616,25 +604,25 @@ EOM - end - return token_db - end -- -+ - def clear - @dbm.clear - @file_count = 0 -- set(".internal", "file_count", 0) -+ set('.internal', 'file_count', 0) - end -- -+ - def each_ct - @dbm.each_key do |ct| - (category, token) = ct.force_encoding('ASCII-8BIT').split(Regexp.new(MAGIC), 2) - yield(category, token) if (category && token) - end - end -- -+ - def add_db(token_db) - add_hash(token_db.hash) - @file_count += + token_db.file_count - end -- -+ - def add_hash(hash) - @dirty = true - hash.flatten(MAGIC) do |k, v| -@@ -645,15 +633,16 @@ EOM - end - end - end -- -+ - def sub_db(token_db) - sub_hash(token_db.hash) - if (@file_count > token_db.file_count) - @file_count -= token_db.file_count - else -- @file_count= 0 -+ @file_count = 0 - end - end -+ - def sub_hash(hash) - @dirty = true - hash.flatten(MAGIC) do |k, v| -@@ -666,24 +655,27 @@ EOM - end - end - end -- -+ - def value(category, token) - v = @dbm[category + MAGIC + token] -- if (v) -- return v.to_f -- else -- return nil -- end -+ return v.to_f if v -+ -+ return nil - end -- -+ - def set(category, token, v) - @dirty = true -- @dbm[category + MAGIC + token] = v.to_s -+ begin -+ @dbm[category + MAGIC + token] = v.to_s -+ rescue -+ @options['message-fh'].puts($ERROR_INFO.inspect, category + MAGIC + token, v.to_s) if (@options['verbose']) -+ @options['message-fh'].puts($ERROR_POSITION) if (@options['debug']) -+ end - end -- -+ - def sub_scalar(category, token, v) - @dirty = true -- if (@file_count > 0) -+ if (@file_count.positive?) 
- @file_count -= 1 - end - oldv = value(category, token) -@@ -695,121 +687,174 @@ EOM - end - end - end -- -- def open(mode="r") -- @lockfh = File::open(@lockfile, "w+") -+ -+ def open(mode = 'r') -+ @lockfh = File.open(@lockfile, 'w+') - case mode -- when "r" -+ when 'r' - begin - @lockfh.flock(File::LOCK_SH) - rescue Errno::EINVAL ## Win9x doesn't support LOCK_SH - @lockfh.flock(File::LOCK_EX) - end -- when "w", "wr", "rw" -+ when 'w', 'wr', 'rw' - @lockfh.flock(File::LOCK_EX) - else - raise "internal error: unknown mode #{mode}" - end -- -- @dbm = open_dbm(@filename, 0600) -- -- if (v = value(".internal", "file_count")) -+ -+ @dbm = open_dbm(@filename, 0o600) -+ -+ if (v = value('.internal', 'file_count')) - @file_count = v.to_i - else - @file_count = 0 -- set(".internal", "file_count", @file_count) -+ set('.internal', 'file_count', @file_count) - end -- if (@options["verbose"]) -- @options["message-fh"].printf("open %s %d tokens %d mails by %d.\n", @filename, @dbm.length, @file_count, Process::pid) -+ if (@options['verbose']) -+ @options['message-fh'].printf("open %s %d tokens %d mails by %d.\n", @filename, @dbm.length, @file_count, -+ Process.pid) - end - @dirty = false - end -- -+ - def close - dirty = @dirty -- set(".internal", "file_count", @file_count) if (dirty) -- if (@options["verbose"]) -- @options["message-fh"].printf("close %s %d tokens %d mails by %d.\n", @filename, @dbm.length, @file_count, Process::pid) -+ set('.internal', 'file_count', @file_count) if dirty -+ if (@options['verbose']) -+ @options['message-fh'].printf("close %s %d tokens %d mails by %d.\n", @filename, @dbm.length, @file_count, -+ Process.pid) - end -- if (@options["debug"] && dirty) -+ if (@options['debug'] && dirty) - key_cts.sort.each do |(c, t)| -- @options["message-fh"].printf("%s %s %s %f\n", @filename, c, t, value(c, t)) -+ @options['message-fh'].printf("%s %s %s %f\n", @filename, c, t.to_utf8, value(c, t)) - end - end - @dbm.close -- -+ - @lockfh.flock(File::LOCK_UN) - @lockfh.close - @dirty = false - end - end -- -+ -+ class TokenNDBM < TokenDBM -+ def initialize(options, language, ext) -+ @filename = options['homedir'] + language + ext + NDBM_ext -+ @lockfile = options['homedir'] + language + ext + NDBM_ext + Lock_ext -+ super -+ end -+ -+ def clear -+ @file_count = 0 -+ @dbm.close -+ begin -+ if (@options['verbose']) -+ @options['message-fh'].printf("unlink %s by %d.\n", @filename, Process.pid) -+ end -+ File.unlink(@filename + '.db') -+ rescue -+ @options['message-fh'].puts($ERROR_INFO.inspect) if (@options['verbose']) -+ @options['message-fh'].puts($ERROR_POSITION) if (@options['debug']) -+ end -+ @dbm = open_dbm(@filename, 0o600) -+ if (@options['verbose']) -+ @options['message-fh'].printf("reopen %s by %d.\n", @filename, Process.pid) -+ end -+ end -+ -+ def open_dbm(filename, mode) -+ DBM.open(filename, mode) -+ end -+ end -+ - class TokenSDBM < TokenDBM - def initialize(options, language, ext) -- @filename = options["homedir"] + language + ext + SDBM_ext -- @lockfile = options["homedir"] + language + ext + SDBM_ext + Lock_ext -+ @filename = options['homedir'] + language + ext + SDBM_ext -+ @lockfile = options['homedir'] + language + ext + SDBM_ext + Lock_ext - super - end -+ - def clear - @file_count = 0 - @dbm.close - begin -- File::unlink(@filename + ".dir") -- File::unlink(@filename + ".pag") -+ if (@options['verbose']) -+ @options['message-fh'].printf("unlink %s by %d.\n", @filename, Process.pid) -+ end -+ File.unlink(@filename + '.dir') -+ File.unlink(@filename + '.pag') - rescue -+ 
@options['message-fh'].puts($ERROR_INFO.inspect) if (@options['verbose']) -+ @options['message-fh'].puts($ERROR_POSITION) if (@options['debug']) - end -- @dbm = open_dbm(@filename, 0600) -- if (@options["verbose"]) -- @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) -+ @dbm = open_dbm(@filename, 0o600) -+ if (@options['verbose']) -+ @options['message-fh'].printf("reopen %s by %d.\n", @filename, Process.pid) - end - end -+ - def open_dbm(filename, mode) -- SDBM::open(filename, mode) -+ SDBM.open(filename, mode) - end - end -- -+ - class TokenGDBM < TokenDBM - def initialize(options, language, ext) - @options = options -- @filename = @options["homedir"] + language + ext + GDBM_ext -- @lockfile = @options["homedir"] + language + ext + GDBM_ext + Lock_ext -+ @filename = @options['homedir'] + language + ext + GDBM_ext -+ @lockfile = @options['homedir'] + language + ext + GDBM_ext + Lock_ext - super - end -+ - def clear - @file_count = 0 - @dbm.close - begin -- File::unlink(@filename) -+ if (@options['verbose']) -+ @options['message-fh'].printf("unlink %s by %d.\n", @filename, Process.pid) -+ end -+ File.unlink(@filename) - rescue -+ @options['message-fh'].puts($ERROR_INFO.inspect) if (@options['verbose']) -+ @options['message-fh'].puts($ERROR_POSITION) if (@options['debug']) - end -- @dbm = open_dbm(@filename, 0600) -- if (@options["verbose"]) -- @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) -+ @dbm = open_dbm(@filename, 0o600) -+ if (@options['verbose']) -+ @options['message-fh'].printf("reopen %s by %d.\n", @filename, Process.pid) - end - end -+ - def open_dbm(filename, mode) -- GDBM::open(filename, mode, GDBM::NOLOCK) -+ GDBM.open(filename, mode, GDBM::NOLOCK) - end - end -- -+ - class TokenBDB1 < TokenDBM - def initialize(options, language, ext) -- @filename = options["homedir"] + language + ext + BDB1_ext -- @lockfile = options["homedir"] + language + ext + BDB1_ext + Lock_ext -+ @filename = options['homedir'] + language + ext + BDB1_ext -+ @lockfile = options['homedir'] + language + ext + BDB1_ext + Lock_ext - super - end -+ - def clear - @file_count = 0 - @dbm.close - begin -- File::unlink(@filename) -+ if (@options['verbose']) -+ @options['message-fh'].printf("unlink %s by %d.\n", @filename, Process.pid) -+ end -+ File.unlink(@filename) - rescue -+ @options['message-fh'].puts($ERROR_INFO.inspect) if (@options['verbose']) -+ @options['message-fh'].puts($ERROR_POSITION) if (@options['debug']) - end -- @dbm = open_dbm(@filename, 0600) -- if (@options["verbose"]) -- @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) -+ @dbm = open_dbm(@filename, 0o600) -+ if (@options['verbose']) -+ @options['message-fh'].printf("reopen %s by %d.\n", @filename, Process.pid) - end - end -+ - def open_dbm(filename, mode) - BDB1::Hash.open(filename, BDB1::CREATE | BDB1::WRITE, mode) - end -@@ -817,22 +862,29 @@ EOM - - class TokenBDB < TokenDBM - def initialize(options, language, ext) -- @filename = options["homedir"] + language + ext + BDB_ext -- @lockfile = options["homedir"] + language + ext + BDB_ext + Lock_ext -+ @filename = options['homedir'] + language + ext + BDB_ext -+ @lockfile = options['homedir'] + language + ext + BDB_ext + Lock_ext - super - end -+ - def clear - @file_count = 0 - @dbm.close - begin -- File::unlink(@filename) -+ if (@options['verbose']) -+ @options['message-fh'].printf("unlink %s by %d.\n", @filename, Process.pid) -+ end -+ File.unlink(@filename) - rescue -+ 
@options['message-fh'].puts($ERROR_INFO.inspect) if (@options['verbose']) -+ @options['message-fh'].puts($ERROR_POSITION) if (@options['debug']) - end -- @dbm = open_dbm(@filename, 0600) -- if (@options["verbose"]) -- @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) -+ @dbm = open_dbm(@filename, 0o600) -+ if (@options['verbose']) -+ @options['message-fh'].printf("reopen %s by %d.\n", @filename, Process.pid) - end - end -+ - def open_dbm(filename, mode) - BDB::Hash.open(filename, nil, BDB::CREATE, mode) - end -@@ -840,64 +892,67 @@ EOM - - class TokenQDBM < TokenDBM - def initialize(options, language, ext) -- @filename = options["homedir"] + language + ext + QDBM_ext -- @lockfile = options["homedir"] + language + ext + QDBM_ext + Lock_ext -+ @filename = options['homedir'] + language + ext + QDBM_ext -+ @lockfile = options['homedir'] + language + ext + QDBM_ext + Lock_ext - super - end -+ - def value(category, token) -- begin -- v = @dbm[category + MAGIC + token] -- rescue DepotError_ENOITEM -- return nil -- else -- return v.to_f -- end -+ v = @dbm[category + MAGIC + token] -+ rescue DepotError_ENOITEM -+ return nil -+ else -+ return v.to_f - end -+ - def add_hash(hash) - @dirty = true - hash.flatten(MAGIC) do |k, v| -- begin -- if (@dbm[k]) -- @dbm[k] = (@dbm[k].to_f + v.to_f).to_s -- else -- ## nerver reached. DepotError_ENOITEM asserted when @dbm[k] is nil -- @dbm[k] = v.to_s -- end -- rescue DepotError_ENOITEM -+ if (@dbm[k]) -+ @dbm[k] = (@dbm[k].to_f + v.to_f).to_s -+ else - @dbm[k] = v.to_s - end - end - end -+ - def clear - @file_count = 0 - @dbm.close - begin -- File::unlink(@filename) -+ if (@options['verbose']) -+ @options['message-fh'].printf("unlink %s by %d.\n", @filename, Process.pid) -+ end -+ File.unlink(@filename) - rescue -+ @options['message-fh'].puts($ERROR_INFO.inspect) if (@options['verbose']) -+ @options['message-fh'].puts($ERROR_POSITION) if (@options['debug']) - end -- @dbm = open_dbm(@filename, 0600) -- if (@options["verbose"]) -- @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) -+ @dbm = open_dbm(@filename, 0o600) -+ if (@options['verbose']) -+ @options['message-fh'].printf("reopen %s by %d.\n", @filename, Process.pid) - end - end -- def open_dbm(filename, mode) -- Depot::open(filename, Depot::OWRITER | Depot::OCREAT) -+ -+ def open_dbm(filename, _mode) -+ Depot.open(filename, Depot::OWRITER | Depot::OCREAT) - end - end - - def get_lang_from_headers(headers) -- reg_char_ja = Regexp::compile('\?(iso-2022-jp|iso-2202-jp|x.sjis|shift.jis|euc.jp)\?', Regexp::IGNORECASE, 'n') -- reg_jis = Regexp::compile("\\x1b\\x24[\\x42\\x40]", nil, 'n') # escape sequence to jisx0208 new and old -- @options["refer-header"].keys.each do |header_name| -+ reg_char_ja = Regexp.compile('\?(iso-2022-jp|iso-2202-jp|x.sjis|shift.jis|euc.jp)\?', Regexp::IGNORECASE) -+ reg_jis = Regexp.compile('\\x1b\\x24[\\x42\\x40]', nil) # escape sequence to jisx0208 new and old -+ -+ @options['refer-header'].each_key do |header_name| - str = headers[header_name] - if (str) - case str - when reg_char_ja -- @options["message-fh"].printf("lang ja header char_ja\n") if (@options["debug"]) -- return ["ja", nil] -+ @options['message-fh'].printf("lang ja header char_ja\n") if (@options['debug']) -+ return ['ja', nil] - when reg_jis -- @options["message-fh"].printf("lang ja header jis\n") if (@options["debug"]) -- return ["ja", "jis"] -+ @options['message-fh'].printf("lang ja header jis\n") if (@options['debug']) -+ return %w[ja jis] - end - end - end -@@ 
-908,72 +963,59 @@ EOM - return get_lang(buf, html_flag) - end - -- def get_lang(buf, html_flag=false) --## reg_euc = Regexp::compile("[\xa1\xa2-\xa1\xbc\xa4\xa1-\xa4\xf3\xa5\xa1-\xa5\xf6]{4}", nil, 'e') # kana in euc-jp without zenkaku-space -- reg_euc = Regexp::compile("[\xa1\xa2-\xa1\xbc\xa4\xa1-\xa4\xf3\xa5\xa1-\xa5\xf6]{4}".force_encoding('EUC-JP')) --## reg_sjis = Regexp::compile("[\x81\x40-\x81\x5b\x82\x9f-\x82\xf1\x83\x40-\x83\x96]{2}", nil, 's') # kana in shift-jis -- reg_sjis = Regexp::compile("[\x81\x40-\x81\x5b\x82\x9f-\x82\xf1\x83\x40-\x83\x96]{2}".force_encoding('SHIFT_JIS')) -- --## reg_utf8 = Regexp::compile("[\xe3\x80\x80-\xe3\x80\x82\xe3\x81\x81-\xe3\x82\x93\xe3\x82\xa1-\xe3\x83\xb6]{4}", nil, 'u') # kana in utf8 -- reg_utf8 = Regexp::compile("[\xe3\x80\x80-\xe3\x80\x82\xe3\x81\x81-\xe3\x82\x93\xe3\x82\xa1-\xe3\x83\xb6]{4}".force_encoding('UTF-8')) -- --## reg_jis = Regexp::compile("\\x1b\\x24[\\x42\\x40]", nil, 'n') # escape sequence to jisx0208 new and old -- reg_jis = Regexp::compile("\\x1b\\x24[\\x42\\x40]".force_encoding('ASCII-8BIT')) --## reg_gb18030_possible = Regexp::compile('[\x80-\x9f]', nil, 'n') -- reg_gb18030_possible = Regexp::compile('[\x80-\x9f]'.force_encoding('ASCII-8BIT')) -- --## reg_char_utf8 = Regexp::compile('(^\w+: .*|charset="?)(utf-8)', Regexp::IGNORECASE, 'n') --## reg_cte_bin = Regexp::compile('\Acontent-transfer-encoding\s*:\s*(base64|quoted-printable)', Regexp::IGNORECASE, 'n') --## reg_c = Regexp::compile('(^\w+: .*|charset="?)(ks_c_5601|euc-kr|big5|gb2312)', Regexp::IGNORECASE, 'n') -- -+ def get_lang(buf, html_flag) -+ reg_euc = Regexp.compile("[\xa1\xa2-\xa1\xbc\xa4\xa1-\xa4\xf3\xa5\xa1-\xa5\xf6]{4}".force_encoding('EUC-JP')) -+ reg_sjis = Regexp.compile("[\x81\x40-\x81\x5b\x82\x9f-\x82\xf1\x83\x40-\x83\x96]{2}".force_encoding('SHIFT_JIS')) -+ reg_utf8 = Regexp.compile("[\xe3\x80\x80-\xe3\x80\x82\xe3\x81\x81-\xe3\x82\x93\xe3\x82\xa1-\xe3\x83\xb6]{4}".force_encoding('UTF-8')) -+ reg_jis = Regexp.compile('\\x1b\\x24[\\x42\\x40]'.force_encoding('ASCII-8BIT')) -+ reg_gb18030_possible = Regexp.compile('[\x80-\x9f]'.force_encoding('ASCII-8BIT')) - gb18030_possible = false - buf.each do |str| -- if (html_flag) -- str = decode_character_reference2u(str) -- end -- if (str.force_encoding('ASCII-8BIT') =~ reg_gb18030_possible) -- gb18030_possible = true -- end -+ str = decode_character_reference2u(str) if html_flag -+ gb18030_possible = true if (str.force_encoding('ASCII-8BIT') =~ reg_gb18030_possible) - -- str_utf8 = str.encode('UTF-16BE', 'UTF-8', :undef => :replace, :invalid => :replace).encode('UTF-8', 'UTF-16BE', :undef => :replace, :invalid => :replace) -- str_sjis = str.encode('UTF-16BE', 'SHIFT_JIS', :undef => :replace, :invalid => :replace).encode('SHIFT_JIS', 'UTF-16BE', :undef => :replace, :invalid => :replace) -- str_euc = str.encode('UTF-16BE', 'EUC-JP', :undef => :replace, :invalid => :replace).encode('EUC-JP', 'UTF-16BE', :undef => :replace, :invalid => :replace) -+ str_utf8 = str.encode('UTF-16BE', 'UTF-8', undef: :replace, invalid: :replace).encode('UTF-8', 'UTF-16BE', -+ undef: :replace, invalid: :replace) -+ str_sjis = str.encode('UTF-16BE', 'SHIFT_JIS', undef: :replace, invalid: :replace).encode('SHIFT_JIS', -+ 'UTF-16BE', undef: :replace, invalid: :replace) -+ str_euc = str.encode('UTF-16BE', 'EUC-JP', undef: :replace, invalid: :replace).encode('EUC-JP', 'UTF-16BE', -+ undef: :replace, invalid: :replace) - - if (str_utf8 =~ reg_utf8) -- @options["message-fh"].printf("lang ja utf8\n") if (@options["debug"]) -- return ["ja", 
"utf8"] -+ @options['message-fh'].printf("lang ja utf8\n") if (@options['debug']) -+ return %w[ja utf8] - elsif (str.force_encoding('ASCII-8BIT') =~ reg_jis) -- @options["message-fh"].printf("lang ja jis\n") if (@options["debug"]) -- return ["ja", "jis"] -+ @options['message-fh'].printf("lang ja jis\n") if (@options['debug']) -+ return %w[ja jis] - elsif (str_sjis =~ reg_sjis) -- @options["message-fh"].printf("lang ja sjis\n") if (@options["debug"]) -- return ["ja", "sjis"] -+ @options['message-fh'].printf("lang ja sjis\n") if (@options['debug']) -+ return %w[ja sjis] - elsif (str_euc =~ reg_euc) -- if (gb18030_possible) -- @options["message-fh"].printf("lang ja gb18030\n") if (@options["debug"]) -- return ["ja", "gb18030"] -+ if gb18030_possible -+ @options['message-fh'].printf("lang ja gb18030\n") if (@options['debug']) -+ return %w[ja gb18030] - else -- @options["message-fh"].printf("lang ja euc\n") if (@options["debug"]) -- return ["ja", "euc"] -+ @options['message-fh'].printf("lang ja euc\n") if (@options['debug']) -+ return %w[ja euc] - end - end - end - return [nil, nil] - end -- -+ - def get_headers(buf, lang) -- headers = DBHash::new -+ headers = DBHash.new - buf = buf.dup -- header_buf = Array::new -+ header_buf = [] - if ((buf[0] !~ /\A>?from\s+(\S+)/i) && # this isn't mail - (buf[0] !~ /\A(\S+):/)) -- if (@options["max-line"] <= 0) -+ if (@options['max-line'] <= 0) - return [headers, buf, lang] - else -- return [headers, buf[0 .. @options["max-line"]], lang] -+ return [headers, buf[0..@options['max-line']], lang] - end - end -- -+ - num_of_dquote = 0 - ignore_dquote = false - -@@ -981,35 +1023,33 @@ EOM - header_buf.push(str) - str = str.chomp - if (str =~ /\A(\S+?):\s*(.*)/) -- current = $1.downcase -- if (current == "received") -- headers[current] = $2.sub(/[\r\n]*\z/, '') -+ current = ::Regexp.last_match(1).downcase -+ if (current == 'received') -+ headers[current] = ::Regexp.last_match(2).sub(/[\r\n]*\z/, '') - else -- headers[current] = (headers[current] || "") + " " + $2.sub(/[\r\n]*\z/, '') -+ headers[current] = (headers[current] || '') + ' ' + ::Regexp.last_match(2).sub(/[\r\n]*\z/, '') - end - elsif (str =~ /\A>?from\s+(\S+)/i) -- headers["ufrom"] = $1 -- elsif (str =~ /\A[\r\n]*\z/ && (ignore_dquote || (num_of_dquote % 2 == 0))) # separator between header and body -+ headers['ufrom'] = ::Regexp.last_match(1) -+ elsif (str =~ /\A[\r\n]*\z/ && (ignore_dquote || num_of_dquote.even?)) # separator between header and body - break -- elsif (str =~ /\A\S/ && (ignore_dquote || (num_of_dquote % 2 == 0))) # found body without separator -- buf.push(str) # rewind -+ elsif (str =~ /\A\S/ && (ignore_dquote || num_of_dquote.even?)) # found body without separator -+ buf.push(str) # rewind - break -- elsif (! current) -+ elsif !current - break -+ elsif (str =~ /\A\s*=\?/) -+ headers[current] += str.sub(/[\r\n]*\z/, '').sub(/\A\s*/, '') - else -- if (str =~ /\A\s*=\?/) -- headers[current] += str.sub(/[\r\n]*\z/, '').sub(/\A\s*/, '') -- else -- headers[current] += str.sub(/[\r\n]*\z/, '').sub(/\A\s*/, ' ') -- end -+ headers[current] += str.sub(/[\r\n]*\z/, '').sub(/\A\s*/, ' ') - end - ## start count on from, to and cc line - ## contiune count while number of dquote is odd -- if ((current =~ /\A(from|to|cc)\z/) || (num_of_dquote % 2 != 0)) -- num_of_dquote = num_of_dquote + str.scan(/\"/).length - str.scan(/\\\"/).length -+ if ((current =~ /\A(from|to|cc)\z/) || num_of_dquote.odd?) -+ num_of_dquote = num_of_dquote + str.scan(/"/).length - str.scan(/\\"/).length - end - -- if (buf.empty? 
&& ! ignore_dquote) # retry? -+ if (buf.empty? && ! ignore_dquote) # retry? - ignore_dquote = true - buf.concat(header_buf) - header_buf.clear -@@ -1017,45 +1057,38 @@ EOM - end - end - -- if ((headers["content-type"] =~ /\bboundary=\s*"(.*?)"/i) || -- (headers["content-type"] =~ /\bboundary=\s*'(.*?)'/i) || -- (headers["content-type"] =~ /\bboundary=([^\s;]+)/i)) -- headers["boundary"] = $1 -+ if ((headers['content-type'] =~ /\bboundary=\s*"(.*?)"/i) || -+ (headers['content-type'] =~ /\bboundary=\s*'(.*?)'/i) || -+ (headers['content-type'] =~ /\bboundary=([^\s;]+)/i)) -+ headers['boundary'] = ::Regexp.last_match(1) - end -- if (headers["content-type"] =~ /charset=([\'\"]*)([^\s\1\;]+)\1/i) -- headers["charset"] = $2 -- end -- if (headers["content-type"] =~ /\A([^;]+)/) -- headers["content-type"] = $1 -- end -- -- if (@options["max-line"] <= 0) -+ headers['charset'] = ::Regexp.last_match(2) if (headers['content-type'] =~ /charset=(['"]*)([^\s\1;]+)\1/i) -+ headers['content-type'] = ::Regexp.last_match(1) if (headers['content-type'] =~ /\A([^;]+)/) -+ -+ if (@options['max-line'] <= 0) - return [headers, buf, lang] - else -- return [headers, buf[0 .. @options["max-line"]], lang] -+ return [headers, buf[0..@options['max-line']], lang] - end - end -- -- -+ - class Jtokenizer - def initialize(method) - case method -- when "bigram" -- @method = Proc::new {|s| bigram(s)} -- when "block" -- @method = Proc::new {|s| block(s)} -- when "mecab" -- @method = Proc::new {|s| mecab(s)} -+ when 'bigram' -+ @method = proc { |s| bigram(s) } -+ when 'block' -+ @method = proc { |s| block(s) } -+ when 'mecab' -+ @method = proc { |s| mecab(s) } - meishi_euc = "\xcc\xbe\xbb\xec".force_encoding('ASCII-8BIT') - meishi_sjis = meishi_euc.encode('SHIFT_JIS', 'EUC-JP').force_encoding('ASCII-8BIT') - meishi_utf8 = meishi_euc.encode('UTF-8', 'EUC-JP').force_encoding('ASCII-8BIT') -+ @m = MeCab::Tagger.new('-Ochasen') -+ node = @m.parseToNode('this is a pen') - if (defined?(MeCab::VERSION)) # defined after 0.90 -- @m = MeCab::Tagger.new("-Ochasen") -- node = @m.parseToNode('this is a pen') - hinshi = node.next.feature.force_encoding('ASCII-8BIT').split(/,/)[0] - else -- @m = MeCab::Tagger.new("-Ochasen") -- node = @m.parseToNode('this is a pen') - hinshi = node.next.getFeature.force_encoding('ASCII-8BIT').split(/,/)[0] - end - case hinshi -@@ -1066,13 +1099,13 @@ EOM - when meishi_utf8 - @m_dic_enc = Encoding::UTF_8 - else -- @m_dic_enc = Encoding::default_external -+ @m_dic_enc = Encoding.default_external - end -- when "chasen" -- Chasen.getopt("-F", '%H %m\n', "-j") -- @method = Proc::new {|s| chasen(s)} -- when "kakasi" -- @method = Proc::new {|s| kakasi(s)} -+ when 'chasen' -+ Chasen.getopt('-F', '%H %m\n', '-j') -+ @method = proc { |s| chasen(s) } -+ when 'kakasi' -+ @method = proc { |s| kakasi(s) } - else - raise "internal error: unknown method #{method}" - end -@@ -1081,42 +1114,30 @@ EOM - def split(str) - @method.call(str) - end -- --## Reg_kanji = Regexp::compile("[\xb0\xa1-\xf4\xa4]+", nil, 'e') -- Reg_kanji = Regexp::compile("[\xb0\xa1-\xf4\xa4]+".force_encoding('EUC-JP')) -- Reg_kanji_ASCII_8BIT = Regexp::compile("[\xb0\xa1-\xf4\xa4]+".force_encoding('ASCII-8BIT')) --## Reg_katakana = Regexp::compile("[\xa1\xbc\xa5\xa1-\xa5\xf6]+", nil, 'e') -- Reg_katakana = Regexp::compile("[\xa1\xbc\xa5\xa1-\xa5\xf6]+".force_encoding('EUC-JP')) --## Reg_kanji_katakana = Regexp::compile("[\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]", nil, 'e') -- Reg_kanji_katakana = 
Regexp::compile("[\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]".force_encoding('EUC-JP')) --# Reg_kanji_katakana = Regexp::compile("[\xb0\xa1-\xf4\xa4 \xa1\xbc \xa5\xa1-\xa5\xf6]".force_encoding('ASCII-8BIT')) - --## Reg_not_kanji_katakana = Regexp::compile("[^\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]", nil, 'e') -- Reg_not_kanji_katakana = Regexp::compile("[^\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]".force_encoding('EUC-JP')) --# Reg_not_kanji_katakana = Regexp::compile("[^\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]".force_encoding('ASCII-8BIT')) -- -+ Reg_kanji = Regexp.compile("[\xb0\xa1-\xf4\xa4]+".force_encoding('EUC-JP')) -+ Reg_katakana = Regexp.compile("[\xa1\xbc\xa5\xa1-\xa5\xf6]+".force_encoding('EUC-JP')) -+ Reg_kanji_katakana = Regexp.compile("[\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]".force_encoding('EUC-JP')) -+ Reg_not_kanji_katakana = Regexp.compile("[^\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]".force_encoding('EUC-JP')) -+ - def kakasi(str) - str = str.gsub(/[\x00-\x7f]/, ' ') -- if (str =~ /\A +\z/) -- return [] -- end -- array = Array::new -- Kakasi::kakasi("-oeuc -w", str).scan(/\S+/).each do |token| -+ return [] if (str =~ /\A +\z/) -+ -+ array = [] -+ Kakasi.kakasi('-oeuc -w', str).scan(/\S+/).each do |token| - token.gsub!(Reg_not_kanji_katakana, '') -- if ((token =~ Reg_kanji) || (token.length > 2)) -- array.push(token) -- end -+ array.push(token) if ((token =~ Reg_kanji) || (token.length > 2)) - end - return array - end -- -+ - def mecab(str) -- str = str.encode(@m_dic_enc, :invalid => :replace, :undef => :replace, :replace => ' ') -+ str = str.encode(@m_dic_enc, invalid: :replace, undef: :replace, replace: ' ') - str = str.gsub(/[\x00-\x7f]/, ' ') -- if (str.length == 0 || str =~ /\A +\z/) -- return [] -- end -- array = Array::new -+ return [] if (str.empty? || str =~ /\A +\z/) -+ -+ array = [] - node = @m.parseToNode(str) - while (node && - (defined?(MeCab::VERSION) || (node.hasNode == 1))) -@@ -1127,72 +1148,63 @@ EOM - token = node.getSurface.encode('EUC-JP', @m_dic_enc) - hinshi = node.getFeature.encode('EUC-JP', @m_dic_enc).split(/,/)[0] - end -- unless (token.valid_encoding?) -- # Scrub token -- token = token.each_char.map { |c| (c.valid_encoding?) ? c : "" }.join -+ unless token.valid_encoding? -+ # Scrub token -+ token = token.each_char.map { |c| c.valid_encoding? ? 
c : '' }.join - end - case hinshi -- when "BOS/EOS" -+ when 'BOS/EOS' - # Skip BOS/EOS - when "\xb5\xad\xb9\xe6".force_encoding('EUC-JP') - # Skip KIGOU - when "\xcc\xbe\xbb\xec".force_encoding('EUC-JP') - # MEISHI -- if ((token =~ Reg_kanji_katakana) || (token.bytesize > 2)) -- array.push(token) -- end -+ array.push(token) if ((token =~ Reg_kanji_katakana) || (token.bytesize > 2)) - else - token.gsub!(Reg_not_kanji_katakana, '') -- if ((token =~ Reg_kanji) || (token.bytesize > 2)) -- array.push(token) -- end -+ array.push(token) if ((token =~ Reg_kanji) || (token.bytesize > 2)) - end - node = node.next - end - return array - end -- -+ - def chasen(str) - str = str.gsub(/[\x00-\x7f]/, ' ') -- if (str =~ /\A +\z/) -- return [] -- end -- array = Array::new -+ return [] if (str =~ /\A +\z/) -+ -+ array = [] - Chasen.sparse(str).split("\n").each do |hinshi_token| -- if (hinshi_token =~ /(.*) (.*)/) -- hinshi = $1 -- token = $2 -- if (hinshi == "\xcc\xbe\xbb\xec") -- if ((token =~ Reg_kanji_katakana) || (token.length > 2)) -- array.push(token) -- end -- else -- token.gsub!(Reg_not_kanji_katakana, '') -- if ((token =~ Reg_kanji) || (token.length > 2)) -- array.push(token) -- end -- end -+ next unless (hinshi_token =~ /(.*) (.*)/) -+ -+ hinshi = ::Regexp.last_match(1) -+ token = ::Regexp.last_match(2) -+ if (hinshi == "\xcc\xbe\xbb\xec") -+ array.push(token) if ((token =~ Reg_kanji_katakana) || (token.length > 2)) -+ else -+ token.gsub!(Reg_not_kanji_katakana, '') -+ array.push(token) if ((token =~ Reg_kanji) || (token.length > 2)) - end - end - return array - end -- -+ - def block(str) - tokens = str.scan(Reg_kanji) - tokens.concat(str.scan(Reg_katakana)) - return tokens - end -- -+ - def bigram(str) -- tokens = Array::new -- -+ tokens = [] -+ - str.scan(Reg_kanji).each do |token| - case token.length - when 1, 2 - tokens.push(token) - else - l = token.length - 1 -- for i in (0 .. l) -+ (0..l).each do |i| - tokens.push(token[i, 2]) - end - end -@@ -1201,108 +1213,108 @@ EOM - return tokens - end - end -- -+ - def tokenize_headers(lang, headers) -- (lang, code) = get_lang_from_headers(headers) if (! lang) -+ (lang,) = get_lang_from_headers(headers) if (! 
lang) - -- head_db = TokenDB::new(lang) -- reg_token = Regexp::compile("\\b\\d[\\d\\.]+\\d\\b|[\\w#{@options['mark-in-token']}]+") -- -- if (headers["received"]) -- str = headers["received"] -- str =~ /envelope\-from\s+([\w@\.\-]+)/ -- efrom = $1 -- str =~ /for\s+<([\w@\.\-]+)>/ -- foraddress = $1 -+ head_db = TokenDB.new(lang) -+ reg_token = Regexp.compile("\\b\\d[\\d\\.]+\\d\\b|[\\w#{@options['mark-in-token']}]+") -+ -+ if (headers['received']) -+ str = headers['received'] -+ str =~ /envelope-from\s+([\w@.-]+)/ -+ efrom = ::Regexp.last_match(1) -+ str =~ /for\s+<([\w@.-]+)>/ -+ foraddress = ::Regexp.last_match(1) - str.sub!(/(\bid|;).*/im, '') -- str.sub!(/\(qmail[^\)]*\)/, '') -- str += " " + efrom if efrom -- str += " " + foraddress if foraddress -- headers["received"] = str -+ str.sub!(/\(qmail[^)]*\)/, '') -+ str += ' ' + efrom if efrom -+ str += ' ' + foraddress if foraddress -+ headers['received'] = str - end - --# if (headers["domainkey-signature"]) --# headers["domainkey-signature"] = headers["domainkey-signature"].sub(/b=[^:;\s]+/, '') --# end -+ # if (headers["domainkey-signature"]) -+ # headers["domainkey-signature"] = headers["domainkey-signature"].sub(/b=[^:;\s]+/, '') -+ # end - --# "authentication-results", "domainkey-signature" -+ # "authentication-results", "domainkey-signature" - headers.each do |header, content| -- if (@options["refer-all-header"] || @options["refer-header"][header]) -- if (lang == "ja") -- content.gsub!(/=\?utf\-8\?([bq])\?(\S*)\?=/i) do |s| -- b_or_q = $1 -- encoded_str = $2 -- if (@options["utf-8"]) -- if (b_or_q =~ /q/i) -- decoded_str = encoded_str.unpack("M*").to_s -- else -- decoded_str = encoded_str.unpack("m*").to_s -- end -- u2eucjp(decoded_str) -- else -- "" -- end -+ next unless (@options['refer-all-header'] || @options['refer-header'][header]) -+ -+ if (lang == 'ja') -+ content.gsub!(/=\?utf-8\?([bq])\?(\S*)\?=/i) do |_s| -+ b_or_q = ::Regexp.last_match(1) -+ encoded_str = ::Regexp.last_match(2) -+ if (@options['utf-8']) -+ decoded_str = if (b_or_q =~ /q/i) -+ encoded_str.unpack('M*').to_s -+ else -+ encoded_str.unpack('m*').to_s -+ end -+ u2eucjp(decoded_str) -+ else -+ '' - end -- content = NKF::nkf('-e -X -Z0', content.gsub(/\?(iso-2202-jp|shift-jis)\?/i, '?ISO-2022-JP?')) -- else -- content = latin2ascii(content) - end -+ content = NKF.nkf('-e -X -Z0', content.gsub(/\?(iso-2202-jp|shift-jis)\?/i, '?ISO-2022-JP?')) -+ else -+ content = latin2ascii(content) -+ end - -- unless (content.valid_encoding?) -- # Scrub str -- content = content.each_char.map { |c| (c.valid_encoding?) ? c : "" }.join -- end -+ unless content.valid_encoding? -+ # Scrub str -+ content = content.each_char.map { |c| c.valid_encoding? ? 
c : '' }.join -+ end - -- content.scan(reg_token).each do |token| -- head_db.add_scalar(header, token, 1) if (token.length < 20) -- @options["message-fh"].printf("tokenizer %s %s\n", header, token) if (@options["debug"]) -+ content.scan(reg_token).each do |token| -+ head_db.add_scalar(header, token, 1) if (token.length < 20) -+ @options['message-fh'].printf("tokenizer %s %s\n", header, token.to_utf8) if (@options['debug']) -+ end -+ if (lang == 'ja') -+ @jtokenizer.split(content.gsub(/\s+/, '')).each do |token| -+ token.force_encoding('ASCII-8BIT') -+ head_db.add_scalar(header, token, 1) -+ @options['message-fh'].printf("tokenizer %s %s\n", header, token.to_utf8) if (@options['debug']) - end -- if (lang == "ja") -- @jtokenizer.split(content.gsub(/\s+/, '')).each do |token| -- token.force_encoding('ASCII-8BIT') -- head_db.add_scalar(header, token, 1) -- @options["message-fh"].printf("tokenizer %s %s\n", header, token) if (@options["debug"]) -- end -- end - end - end - return head_db - end -- -+ - def tokenize_buf(buf) -- lang = nil # lang in unknown at first -- -- separators = Array::new -- delimiters = Array::new -+ lang = nil # lang in unknown at first -+ -+ separators = [] -+ delimiters = [] - (headers, buf, lang) = get_headers(buf, lang) -- if (headers.empty?) # this is not a mail -+ if headers.empty? # this is not a mail - (db, buf) = tokenize_body(lang, headers, buf, separators, delimiters) -- db.time = Time::new -- db.language = Default_Language if (! db.language) --## db.language = Default_Language if (@options["unified-db"]) -+ db.time = Time.new -+ db.language = Default_Language unless db.language -+ ## db.language = Default_Language if (@options["unified-db"]) - return db - end -- -- body_db = TokenDB::new(lang) -- body_db.message_id = headers["message-id"] || "-" -- -- sub_head_db = TokenDB::new(lang) -+ -+ body_db = TokenDB.new(lang) -+ body_db.message_id = headers['message-id'] || '-' -+ -+ sub_head_db = TokenDB.new(lang) - main_head_db = tokenize_headers(lang, headers) - lang = main_head_db.language if main_head_db -- -+ - found_html_part = false -- plain_bodies = Array::new -- html_bodies = Array::new -- -- while (! buf.empty?) -- separators.push("--" + headers["boundary"]) if (headers["boundary"]) -- delimiters.push("--" + headers["boundary"] + "--") if (headers["boundary"]) -- -- if ((! headers["content-type"]) || -- (headers["content-type"] !~ /rfc822/i)) -+ plain_bodies = [] -+ html_bodies = [] -+ -+ until buf.empty? 
-+ separators.push('--' + headers['boundary']) if (headers['boundary']) -+ delimiters.push('--' + headers['boundary'] + '--') if (headers['boundary']) -+ -+ if ((!headers['content-type']) || -+ (headers['content-type'] !~ /rfc822/i)) - (db, buf) = tokenize_body(lang, headers, buf, separators, delimiters) - lang = db.language -- if (headers["content-type"] =~ /html/i) -+ if (headers['content-type'] =~ /html/i) - found_html_part = true - html_bodies.push(db) - else -@@ -1313,138 +1325,129 @@ EOM - db = tokenize_headers(lang, headers) - sub_head_db.add_db(db) - end -- -- if (@options["ignore-plain-text-part"] && found_html_part) -- html_bodies.each do |db| -- body_db.add_db(db) -- end -- else # default -- html_bodies.each do |db| -- body_db.add_db(db) -- end -+ -+ html_bodies.each do |db| -+ body_db.add_db(db) -+ end -+ unless (@options['ignore-plain-text-part'] && found_html_part) # default - plain_bodies.each do |db| - body_db.add_db(db) - end - end -- -+ - body_db.add_db(main_head_db) - body_db.add_db(sub_head_db) - body_db.file_count = 1 -- body_db.time = Time::new -- body_db.language = Default_Language if (! body_db.language) --## body_db.language = Default_Language if (@options["unified-db"]) -+ body_db.time = Time.new -+ body_db.language = Default_Language unless body_db.language -+ ## body_db.language = Default_Language if (@options["unified-db"]) - return body_db - end -- -+ - def i2eucjp(i) -- u2eucjp([i].pack("U")) -+ u2eucjp([i].pack('U')) - end -- -+ - def i2ascii(i) -- latin2ascii(u2latin([i].pack("U"))) -+ latin2ascii(u2latin([i].pack('U'))) - end -- -+ - def i2u(i) -- [i].pack("U") -+ [i].pack('U') - end - - def decode_character_reference2u(str) -- reg = Regexp::compile("\&\#(\d{1,5}|x[\da-f]{1,4});".force_encoding('UTF-8'), Regexp::IGNORECASE) -- if (@options["utf-8"]) -- newstr = str.gsub(reg) do -- hex_or_dec = $1 -- if (hex_or_dec =~ /^x(.*)/i) -- hex_str = $1 -- i2u(hex_str.hex) -- else -- i2u(hex_or_dec.to_i) -- end -- end -- else -- newstr = str.gsub(reg, "") -- end -+ reg = Regexp.compile('\&\#(\d{1,5}|x[\da-f]{1,4});'.force_encoding('ASCII-8BIT'), Regexp::IGNORECASE) -+ newstr = if (@options['utf-8']) -+ str.gsub(reg) do -+ hex_or_dec = ::Regexp.last_match(1) -+ if (hex_or_dec =~ /^x(.*)/i) -+ hex_str = ::Regexp.last_match(1) -+ i2u(hex_str.hex).force_encoding('ASCII-8BIT') -+ else -+ i2u(hex_or_dec.to_i).force_encoding('ASCII-8BIT') -+ end -+ end -+ else -+ str.gsub(reg, '') -+ end - return newstr - end - - def decode_character_reference(str, lang) -- if (@options["utf-8"]) -- newstr = str.gsub(/\&\#(\d{1,5}|x[\da-f]{1,4});/i) do -- hex_or_dec = $1 -- if (hex_or_dec =~ /^x(.*)/i) -- hex_str = $1 -- if (lang == "ja") -- i2eucjp(hex_str.hex) -- else -- i2ascii(hex_str.hex) -- end -- else -- if (lang == "ja") -- i2eucjp(hex_or_dec.to_i) -- else -- i2ascii(hex_or_dec.to_i) -- end -- end -- end -- else -- newstr = str.gsub(/\&\#(\d{1,5}|x[\da-f]{1,4});/i, "") -- end -+ newstr = if (@options['utf-8']) -+ str.gsub(/&\#(\d{1,5}|x[\da-f]{1,4});/i) do -+ hex_or_dec = ::Regexp.last_match(1) -+ if (hex_or_dec =~ /^x(.*)/i) -+ hex_str = ::Regexp.last_match(1) -+ if (lang == 'ja') -+ i2eucjp(hex_str.hex) -+ else -+ i2ascii(hex_str.hex) -+ end -+ elsif (lang == 'ja') -+ i2eucjp(hex_or_dec.to_i) -+ else -+ i2ascii(hex_or_dec.to_i) -+ end -+ end -+ else -+ str.gsub(/&\#(\d{1,5}|x[\da-f]{1,4});/i, '') -+ end - return newstr - end -- -+ - def tokenize_str(str, lang) -- body_hash = DBHash::new(0) -- url_hash = DBHash::new(0) -- -- reg_token = 
Regexp::compile("(?:http:|www)[\\w\\-\\.\\/@%:\?=]+|[\\w\\-\\.]+@[\\w\\-\\.]+|\\b\\d[\\d\\.]+\\d\\b|[\\w#{@options['mark-in-token']}]+") -- reg_url = Regexp::compile('(^http:|https:|^www|@)') -- reg_token2 = Regexp::compile('\b\d[\d\.]+\d\b|[\w%]+') -- reg_noret = Regexp::compile('[\r\n]*\z') -- -- unless (str.valid_encoding?) -+ body_hash = DBHash.new(0) -+ url_hash = DBHash.new(0) -+ -+ reg_token = Regexp.compile("(?:http:|www)[\\w\\-\\.\\/@%:\?=]+|[\\w\\-\\.]+@[\\w\\-\\.]+|\\b\\d[\\d\\.]+\\d\\b|[\\w#{@options['mark-in-token']}]+") -+ reg_url = Regexp.compile('(^http:|https:|^www|@)') -+ reg_token2 = Regexp.compile('\b\d[\d\.]+\d\b|[\w%]+') -+ # reg_noret = Regexp::compile('[\r\n]*\z') -+ -+ unless str.valid_encoding? - # Scrub str -- str = str.each_char.map { |c| (c.valid_encoding?) ? c : "" }.join -+ str = str.each_char.map { |c| c.valid_encoding? ? c : '' }.join - end - - str.scan(reg_token).each do |token| - if (token =~ reg_url) - token.scan(reg_token2).each do |token2| - if (token2.length < 20) -- url_hash[token2] += 1 -- @options["message-fh"].printf("tokenizer %s %s\n", "url", token2) if (@options["debug"]) -+ url_hash[token2] += 1 -+ @options['message-fh'].printf("tokenizer %s %s\n", 'url', token2.to_utf8) if (@options['debug']) - end - end - elsif (token.length < 20) -- body_hash[token] += 1 -- @options["message-fh"].printf("tokenizer C %s %s\n", "body", token) if (@options["debug"]) -+ body_hash[token] += 1 -+ @options['message-fh'].printf("tokenizer C %s %s\n", 'body', token.to_utf8) if (@options['debug']) - end - end -- -- if (lang == "ja") --# str.gsub!(Regexp::compile("^[ -\\~]*[\|\>]+", nil, 'e'), '') # delete cite mark -- str.gsub!(Regexp::compile("^[ -\\~]*[\|\>]+".force_encoding('EUC-JP')), '') --# str.gsub!(Regexp::compile("^[ \\t\xa1\xa1]+", nil, 'e'), '') # delete white space -- str.gsub!(Regexp::compile("^[ \\t\xa1\xa1]+".force_encoding('EUC-JP')), '') # delete white space --# str.gsub!(Regexp::compile("(\\r?\\n){2,}", nil, 'e'), ' ') # keep multiple newline as space -- str.gsub!(Regexp::compile("(\\r?\\n){2,}".force_encoding('EUC-JP')), ' ') # keep multiple newline as space --# str.gsub!(Regexp::compile("[\\r\\n]+", nil, 'e'), '') # delete newline -- str.gsub!(Regexp::compile("[\\r\\n]+".force_encoding('EUC-JP')), '') # delete newline -+ -+ if (lang == 'ja') -+ str.gsub!(Regexp.compile("^[ -\\~]*[\|\>]+".force_encoding('EUC-JP')), '') -+ str.gsub!(Regexp.compile("^[ \\t\xa1\xa1]+".force_encoding('EUC-JP')), '') # delete white space -+ str.gsub!(Regexp.compile('(\\r?\\n){2,}'.force_encoding('EUC-JP')), ' ') # keep multiple newline as space -+ str.gsub!(Regexp.compile('[\\r\\n]+'.force_encoding('EUC-JP')), '') # delete newline - str.split.each do |s| - @jtokenizer.split(s).each do |token| - token.force_encoding('ASCII-8BIT') - body_hash[token] += 1 -- @options["message-fh"].printf("tokenizer ja %s %s\n", "body", token) if (@options["debug"]) -+ @options['message-fh'].printf("tokenizer ja %s %s\n", 'body', token.to_utf8) if (@options['debug']) - end - end - end - return [body_hash, url_hash] - end -- -+ - def base64_encoded?(buf) - [buf.dup, buf.reverse].each do |b| - while (str = b.shift) -- if (str =~ /\A[\.\s\r\n]*\z/) -+ # if (str =~ /\A[\.\s\r\n]*\z/) -+ if (str =~ /\A[.\s]*\z/) - next -- elsif (str =~ /\A[A-z0-9=+\/]+[\s\r\n]*\z/) -+ elsif (str =~ %r{\A[A-z0-9=+/]+\s*\z}) - break - else - return false -@@ -1455,19 +1458,19 @@ EOM - end - - def tokenize_body(lang, headers, body, separators, delimiters) -- reg_return_codes = Regexp::compile('[\r\n]*\z') -- -- db 
= TokenDB::new(lang)
-+ reg_return_codes = Regexp.compile('[\r\n]*\z')
-+
-+ db = TokenDB.new(lang)
- body = body.dup
--
-- buf = Array::new
--
-+
-+ buf = []
-+
- delimiter = delimiters.last
- separator = separators.last
--
-- if (separators.empty?)
-+
-+ if separators.empty?
- buf = body
-- body = Array::new
-+ body = []
- else
- while (str = body.shift)
- str_noret = str.sub(reg_return_codes, '')
-@@ -1485,177 +1488,166 @@ EOM
- end
- end
- end
--
-- if (headers["content-type"] && headers["content-type"] !~ /text/i)
-- return [db, body] # skip non-text body
-+
-+ if (headers['content-type'] && headers['content-type'] !~ /text/i)
-+ return [db, body] # skip non-text body
- end
--
-- case headers["content-transfer-encoding"]
-+
-+ case headers['content-transfer-encoding']
- when /base64/i
-- if (base64_encoded?(buf))
--## buf.map! {|str| str.unpack("m*").to_s}
-- buf = buf.join.gsub(/[\r\n]/, '').unpack("m*")
-+ if base64_encoded?(buf)
-+ ## buf.map! {|str| str.unpack("m*").to_s}
-+ buf = buf.join.gsub(/[\r\n]/, '').unpack('m*')
- end
- when /quoted-printable/i
-- buf.map! {|str| str.unpack("M*").join}
-+ buf.map! { |str| str.unpack('M*').join }
- end
-
- lang_backup = lang
-- if (headers["content-type"] =~ /html/i)
-+ if (headers['content-type'] =~ /html/i)
- (lang, code) = get_lang_from_buf(buf, true)
- else
- (lang, code) = get_lang_from_buf(buf, false)
- end
-- if (! lang)
-- lang = lang_backup
-- end
-+ lang ||= lang_backup
-
- str = buf.join
- str.gsub!(/^begin[^\r\n]+(([\r\n]+M)([^\r\n]+))*/, '') # remove uuencoded lines
-
-- if (lang == "ja")
-- if (code == "utf8")
-- if (@options["utf-8"])
-+ if (lang == 'ja')
-+ if (code == 'utf8')
-+ if (@options['utf-8'])
- str = u2eucjp(str)
- else
-- lang = Default_Language # can't use iconv / stop ja tokenizer
-+ lang = Default_Language # can't use iconv / stop ja tokenizer
- end
-- elsif (code == "gb18030")
-- if (@options["utf-8"])
-+ elsif (code == 'gb18030')
-+ if (@options['utf-8'])
- str = gb180302eucjp(str)
- else
- lang = Default_Language
- end
- else
-- str = NKF::nkf('-e -X -Z0', str)
-+ str = NKF.nkf('-e -X -Z0', str)
- end
- else
- str = latin2ascii(str)
- end
-
-- tags = Array::new
-- if (headers["content-type"] =~ /html/i)
-+ tags = []
-+ if (headers['content-type'] =~ /html/i)
- # remove salad at head of part
- encoding = str.encoding
- str.force_encoding('ASCII-8BIT')
-- if (str =~ Regexp::compile('\A[^<>]*?(<(\?xml|!doctype|html|body)\b.*)\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n'))
-- str = $1
-+ if (str =~ Regexp.compile('\A[^<>]*?(<(\?xml|!doctype|html|body)\b.*)\z',
-+ Regexp::MULTILINE | Regexp::IGNORECASE))
-+ str = ::Regexp.last_match(1)
- end
-
- # remove salad in head, except style
- if (str =~ /\A(.*?)(<body.*)\z/im)
-- before_body_tag = $1
-- after_body_tag = $2
-- before_body_tag.gsub!(/>[^<>]*<(?!\/style)/im, '><')
-+ before_body_tag = ::Regexp.last_match(1)
-+ after_body_tag = ::Regexp.last_match(2)
-+ before_body_tag.gsub!(%r{>[^<>]*<(?!/style)}im, '><')
- str = before_body_tag + after_body_tag
- end
--
-+
- # remove

-- str.gsub!(/(<span[^>]*display\s*:\s*none[^>]*>)([^<>]*)(<\/span>)/im, '')
--
-- if (@options["ignore-after-last-atag"])
-- if (str =~ /\A(.*)<\/a>/im)
-- str = $1
-- end
-- end
-+ str.gsub!(%r{(<span[^>]*display\s*:\s*none[^>]*>)([^<>]*)(</span>)}im, '')
-
-+ str = ::Regexp.last_match(1) if (@options['ignore-after-last-atag']) && (str =~ %r{\A(.*)</a>}im)
-+
- # remove salad after body or html
-- if (str =~ Regexp::compile('\A(.*</body>)[^<>]*?\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n'))
-- str = $1
-+ if (str =~ Regexp.compile('\A(.*</body>)[^<>]*?\z', Regexp::MULTILINE | Regexp::IGNORECASE))
-+ str = ::Regexp.last_match(1)
- end
-- if (str =~ Regexp::compile('\A(.*</html>)[^<>]*?\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n'))
-- str = $1
-+ if (str =~ Regexp.compile('\A(.*</html>)[^<>]*?\z', Regexp::MULTILINE | Regexp::IGNORECASE))
-+ str = ::Regexp.last_match(1)
- end
-- str.gsub!(Regexp::compile('<[^>]*>', Regexp::MULTILINE, 'n')) do |t|
-+ str.gsub!(Regexp.compile('<[^>]*>', Regexp::MULTILINE)) do |t|
- t = t.gsub(/\n/, '')
-- if (t =~ RE_ALL_TAGS) # end tags are thrown away
-+ if (t =~ RE_ALL_TAGS) # end tags are thrown away
- t.force_encoding(encoding)
- tags.push(t)
- end
--
-+
- t.force_encoding('ASCII-8BIT')
-+ t.force_encoding(encoding)
- if (t =~ RE_SPACE_TAGS)
-- t.force_encoding(encoding)
-- " "
-+ ' '
- else
-- t.force_encoding(encoding)
-- ""
-+ ''
- end
- end
- str.force_encoding(encoding)
- body_str = decode_character_reference(str, lang) # out of tags
- tag_str = decode_character_reference(tags.join, lang) # in tags
-- else # if plain text
-+ else # if plain text
- body_str = str
-- tag_str = ""
-+ tag_str = ''
- end
- (body_hash, url_body_hash) = tokenize_str(body_str, lang)
- (tag_hash, url_tag_hash) = tokenize_str(tag_str, lang)
--
-- if (! body_hash.empty? && @options["use-body"])
-- db.add_hash({"body" => body_hash})
-- end
-- if (! tag_hash.empty?)
-- db.add_hash({"tag" => tag_hash})
-- end
-- if (! url_body_hash.empty?)
-- db.add_hash({"url" => url_body_hash})
-- end
-- if (! url_tag_hash.empty?)
-- db.add_hash({"url" => url_tag_hash})
-- end
-+
-+ db.add_hash({ 'body' => body_hash }) if (!body_hash.empty? && @options['use-body'])
-+ db.add_hash({ 'tag' => tag_hash }) unless tag_hash.empty?
-+ db.add_hash({ 'url' => url_body_hash }) unless url_body_hash.empty?
-+ db.add_hash({ 'url' => url_tag_hash }) unless url_tag_hash.empty?
- db.file_count = 1 -- db.language = lang -+ db.language = lang - return [db, body] - end -- -- class Probability # for each lang -+ -+ # for each lang -+ class Probability - def initialize(options, lang) - @options = options -- @filename = @options["homedir"] + lang + Prob_ext -- case (@options["db"]) -- when "sdbm" -- @clean = TokenSDBM::new(@options, lang, Clean_ext) -- @spam = TokenSDBM::new(@options, lang, Spam_ext) -- @prob = TokenSDBM::new(@options, lang, Prob_ext) -- when "gdbm" -- @clean = TokenGDBM::new(@options, lang, Clean_ext) -- @spam = TokenGDBM::new(@options, lang, Spam_ext) -- @prob = TokenGDBM::new(@options, lang, Prob_ext) -- when "bdb1" -- @clean = TokenBDB1::new(@options, lang, Clean_ext) -- @spam = TokenBDB1::new(@options, lang, Spam_ext) -- @prob = TokenBDB1::new(@options, lang, Prob_ext) -- when "bdb" -- @clean = TokenBDB::new(@options, lang, Clean_ext) -- @spam = TokenBDB::new(@options, lang, Spam_ext) -- @prob = TokenBDB::new(@options, lang, Prob_ext) -- when "qdbm" -- @clean = TokenQDBM::new(@options, lang, Clean_ext) -- @spam = TokenQDBM::new(@options, lang, Spam_ext) -- @prob = TokenQDBM::new(@options, lang, Prob_ext) -+ @filename = @options['homedir'] + lang + Prob_ext -+ case (@options['db']) -+ when 'ndbm' -+ @clean = TokenNDBM.new(@options, lang, Clean_ext) -+ @spam = TokenNDBM.new(@options, lang, Spam_ext) -+ @prob = TokenNDBM.new(@options, lang, Prob_ext) -+ when 'sdbm' -+ @clean = TokenSDBM.new(@options, lang, Clean_ext) -+ @spam = TokenSDBM.new(@options, lang, Spam_ext) -+ @prob = TokenSDBM.new(@options, lang, Prob_ext) -+ when 'gdbm' -+ @clean = TokenGDBM.new(@options, lang, Clean_ext) -+ @spam = TokenGDBM.new(@options, lang, Spam_ext) -+ @prob = TokenGDBM.new(@options, lang, Prob_ext) -+ when 'bdb1' -+ @clean = TokenBDB1.new(@options, lang, Clean_ext) -+ @spam = TokenBDB1.new(@options, lang, Spam_ext) -+ @prob = TokenBDB1.new(@options, lang, Prob_ext) -+ when 'bdb' -+ @clean = TokenBDB.new(@options, lang, Clean_ext) -+ @spam = TokenBDB.new(@options, lang, Spam_ext) -+ @prob = TokenBDB.new(@options, lang, Prob_ext) -+ when 'qdbm' -+ @clean = TokenQDBM.new(@options, lang, Clean_ext) -+ @spam = TokenQDBM.new(@options, lang, Spam_ext) -+ @prob = TokenQDBM.new(@options, lang, Prob_ext) - end -- -+ - @language = lang - end -- -+ - attr_accessor :prob, :clean, :spam, :spam_cutoff, :language -- -+ - def merge_dbs_of_lang(token_dbs) -- new_db = TokenDB::new -+ new_db = TokenDB.new - token_dbs.each do |db| -- if (@language == db.language) -- new_db.add_db(db) -- end -+ new_db.add_db(db) if (@language == db.language) - end - return new_db - end - end -- -+ - class Graham < Probability - def initialize(options, lang) - @spam_cutoff = 0.9 -@@ -1666,79 +1658,79 @@ EOM - def product(a) - n = 1 - a.each do |v| -- n = n * v if (v != 0) -+ n *= v if (v != 0) - end - return n - end - - def get_combined_probability(token_db) -- prob_db = TokenDB::new # temporary -- -+ prob_db = TokenDB.new # temporary -+ - token_db.each_ct do |category, token| - probability = @prob.value_with_degene(category, token) -- if (probability) -+ if probability - prob_db.set_scalar(category, token, probability) - else - prob_db.set_scalar(category, token, @default_probability) # 0.4 - end - end -- -- probs = prob_db.values.sort {|a, b| (b - 0.5).abs <=> (a - 0.5).abs}[0, 15] -- -- if (@options["debug"]) -- prob_array = Array::new -+ -+ probs = prob_db.values.sort { |a, b| (b - 0.5).abs <=> (a - 0.5).abs }[0, 15] -+ -+ if (@options['debug']) -+ prob_array = [] - prob_db.each_ct do |c, t| - 
prob_array.push([[c, t], prob_db.value(c, t)]) - end -- prob_array.sort! {|a, b| (b[1] - 0.5).abs <=> (a[1] - 0.5).abs} -+ prob_array.sort! { |a, b| (b[1] - 0.5).abs <=> (a[1] - 0.5).abs } - prob_array = prob_array[0, 15] -- prob_array.sort! {|a, b| b[1] <=> a[1]} -+ prob_array.sort! { |a, b| b[1] <=> a[1] } - prob_array.each do |k, v| -- @options["message-fh"].printf("word probability %s %s %f\n", k[0], k[1], v) -+ @options['message-fh'].printf("word probability %s %s %f\n", k[0], k[1].to_str, v) - end - end -- -+ - prod = product(probs) -- token_db.probability = prod / (prod + product(probs.map {|x| 1 - x})) -- if (token_db.probability > @spam_cutoff) -- token_db.spam_flag = true -- else -- token_db.spam_flag = false -- end -+ token_db.probability = prod / (prod + product(probs.map { |x| 1 - x })) -+ token_db.spam_flag = if (token_db.probability > @spam_cutoff) -+ true -+ else -+ false -+ end - return token_db - end -- -+ - def update_probability(token_dbs) - c_count = [@clean.file_count, 1].max - s_count = [@spam.file_count, 1].max -- -- if (token_dbs.empty?) -+ -+ if token_dbs.empty? - incremental = false - target_cts = @clean.key_cts | @spam.key_cts -- @prob.open("w") -+ @prob.open('w') - @prob.clear - else - incremental = true - merged_db = merge_dbs_of_lang(token_dbs) - target_cts = merged_db.key_cts -- return if (target_cts.empty?) -- @prob.open("rw") -+ return if target_cts.empty? -+ -+ @prob.open('rw') - end - old_file_count = @prob.file_count - new_file_count = 0 -- -+ - cnum = c_count.to_f - snum = s_count.to_f -- -+ - target_cts.each do |(category, token)| - c_count = @clean.value(category, token) || 0 - s_count = @spam.value(category, token) || 0 -- update = false - if (incremental && @prob.value(category, token)) - @prob.sub_scalar(category, token, 1.0) # 1.0 is big enough for delete - new_file_count -= 1 - end -- if (c_count == 0) -+ if c_count.zero? - if (s_count > 10) - new_file_count += 1 - @prob.set_scalar(category, token, 0.9999) -@@ -1746,7 +1738,7 @@ EOM - new_file_count += 1 - @prob.set_scalar(category, token, 0.9998) - end -- elsif (s_count == 0) -+ elsif s_count.zero? - if (c_count > 10) - new_file_count += 1 - @prob.set_scalar(category, token, 0.0001) -@@ -1764,81 +1756,79 @@ EOM - @prob.set_scalar(category, token, p) - end - end -- @prob.file_count = new_file_count + old_file_count if (incremental) -+ @prob.file_count = new_file_count + old_file_count if incremental - @prob.close - end - end -- -+ - class Robinson < Probability - def initialize(options, lang) - @robx_max = 1 - @min_dev = 0.1 - @spam_cutoff = 0.582 - @center = 0.5 -- @robs = 0.001 # from bogofilter/robinson.h -+ @robs = 0.001 # from bogofilter/robinson.h - @default_robx = 0.415 # from bogofilter/robinson.h / not used - super - end -- -- def get_pw(category, token, g, b) -+ -+ def get_pw(_category, _token, _g, _b) - return pw - end -- -- -+ - def update_probability(token_dbs) -- pwdb = TokenDB::new -+ pwdb = TokenDB.new - c_count = [@clean.file_count, 1].max - s_count = [@spam.file_count, 1].max -- -- if (token_dbs.empty?) -+ -+ if token_dbs.empty? - incremental = false - target_cts = @clean.key_cts | @spam.key_cts - else - incremental = true - merged_db = merge_dbs_of_lang(token_dbs) - target_cts = merged_db.key_cts -- return if (target_cts.empty?) -+ return if target_cts.empty? 
- end -- -+ - ## loop1 - ## get pw and robx(average of pw) - count = 0 - pw_sum = 0.0 -- -+ - good_mail = [1, @clean.file_count].max.to_f - bad_mail = [1, @spam.file_count].max.to_f - target_cts.each do |(category, token)| - g = [@clean.value(category, token) || 0, c_count].min - b = [@spam.value(category, token) || 0, s_count].min - n = g + b -- if (n == 0) -+ if n.zero? - pwdb.set_scalar(category, token, nil) # need to delete this token from prob.db - else - pw = (b / bad_mail) / (b / bad_mail + g / good_mail) -- if ((@robx_max == 0) || (n <= @robx_max)) -+ if (@robx_max.zero? || (n <= @robx_max)) - pw_sum += pw - count += 1 - end - pwdb.set_scalar(category, token, pw) - end - end -- -- if (incremental) -- @prob.open("rw") -+ -+ if incremental -+ @prob.open('rw') - old_file_count = @prob.file_count -- old_robx = @prob.value(".internal", "robx") || @default_robx -+ old_robx = @prob.value('.internal', 'robx') || @default_robx - robx = (pw_sum + old_file_count * old_robx) / (count + old_file_count) -- robs = @robs - else -- @prob.open("w") -+ @prob.open('w') - @prob.clear -- if (count != 0) -- robx = pw_sum / count -- else -- robx = @default_robx -- end -- robs = @robs -+ robx = if (count != 0) -+ pw_sum / count -+ else -+ @default_robx -+ end - end -+ robs = @robs - ## loop2 - ## get fw from pw - new_file_count = 0 -@@ -1849,83 +1839,85 @@ EOM - pw = pwdb.value(category, token) - if (incremental && @prob.value(category, token)) - new_file_count -= 1 -- @prob.sub_scalar(category, token, 1.0) # 1.0 is big enough for delete -+ @prob.sub_scalar(category, token, 1.0) # 1.0 is big enough for delete - end -- if (pw) -+ if pw - new_file_count += 1 - @prob.set_scalar(category, token, (robs * robx + n * pw) / (robs + n)) # fw - end - end -- @prob.set_scalar(".internal", "robx", robx) -- @prob.file_count = new_file_count + old_file_count if (incremental) -+ @prob.set_scalar('.internal', 'robx', robx) -+ @prob.file_count = new_file_count + old_file_count if incremental - @prob.close - end -- -+ - def get_probability(pminus, qminus, count) - r = 1.0 / [1, count].max -- p = 1.0 - Math::exp(pminus.ln * r) -- q = 1.0 - Math::exp(qminus.ln * r) -+ p = 1.0 - Math.exp(pminus.ln * r) -+ q = 1.0 - Math.exp(qminus.ln * r) - s = (1.0 + (p - q) / (p + q)) / 2.0 - return s - end -- -+ - def get_combined_probability(token_db) -- robx = @prob.value(".internal", "robx") || @default_robx -- -+ robx = @prob.value('.internal', 'robx') || @default_robx -+ - count = 0 -- pminus = FLOAT::new(1) -- qminus = FLOAT::new(1) -+ pminus = FLOAT.new(1) -+ qminus = FLOAT.new(1) - token_db.each_ct do |category, token| - probability = @prob.value_with_degene(category, token) || robx -- if ((probability - @center).abs > @min_dev) -- if (probability <= 0.0) -- probability = 0.0000001 -- elsif (probability >= 1.0) -- probability = 0.9999999 -- end -- c = token_db.value(category, token) -- count += c -- pminus = pminus * FLOAT::new(1.0 - probability, c) -- qminus = qminus * FLOAT::new(probability, c) -- @options["message-fh"].printf("word probability %s %s %d %f\n", category, token, c, probability) if (@options["debug"]) -+ next unless ((probability - @center).abs > @min_dev) -+ -+ if (probability <= 0.0) -+ probability = 0.0000001 -+ elsif (probability >= 1.0) -+ probability = 0.9999999 - end -+ c = token_db.value(category, token) -+ count += c -+ pminus *= FLOAT.new(1.0 - probability, c) -+ qminus *= FLOAT.new(probability, c) -+ if (@options['debug']) -+ @options['message-fh'].printf("word probability %s %s %d %f\n", category, 
token.to_utf8, c, -+ probability) -+ end - end -- -- if (count == 0) -- token_db.probability = 0.0 -- else -- token_db.probability = get_probability(pminus, qminus, count) -- end -- if (token_db.probability > @spam_cutoff) -- token_db.spam_flag = true -- else -- token_db.spam_flag = false -- end -+ -+ token_db.probability = if count.zero? -+ 0.0 -+ else -+ get_probability(pminus, qminus, count) -+ end -+ token_db.spam_flag = if (token_db.probability > @spam_cutoff) -+ true -+ else -+ false -+ end - return token_db - end - end -- -- -+ - class RobinsonFisher < Robinson - def initialize(options, lang) - super - @spam_cutoff = 0.95 - end -- -+ - def chi2q(x2, v) - m = x2 / 2.0 -- sum = Math::exp(0.0 - m) -- term = FLOAT::new -+ sum = Math.exp(0.0 - m) -+ term = FLOAT.new - term.exp = 0.0 - m - term.mant = 1 -- -- (1 .. (v / 2) - 1).each do |i| -- term = term * FLOAT::new(m / i) -+ -+ (1..(v / 2) - 1).each do |i| -+ term *= FLOAT.new(m / i) - sum += term.to_f - end - return sum < 1.0 ? sum : 1.0 - end -- -+ - def get_probability(pminus, qminus, count) - p = 1 - chi2q(-2.0 * pminus.ln, 2 * count) - q = 1 - chi2q(-2.0 * qminus.ln, 2 * count) -@@ -1933,199 +1925,198 @@ EOM - return s - end - end -- -+ - def init_dir(dir) -- if (! FileTest::directory?(dir)) -- Dir.mkdir(dir, 0700) -- end -+ return if FileTest.directory?(dir) -+ -+ Dir.mkdir(dir, 0o700) - end -- -- def soft_raise(str=nil) -- STDERR.puts str if (str) -- STDERR.puts "Try `#{File.basename($0)} --help' for more information." -+ -+ def soft_raise(str = nil) -+ warn str if str -+ warn "Try `#{File.basename($PROGRAM_NAME)} --help' for more information." - exit 2 - end - - def usage -- -- print < 0) -+ @options['languages'].each do |lang| -+ @db_hash[lang].clean.open('r') -+ @db_hash[lang].clean.export(fh) if @db_hash[lang].clean.file_count.positive? - @db_hash[lang].clean.close - end - end - end -- if (@options["export-spam"]) -+ if (@options['export-spam']) - open_wo(file) do |fh| -- @options["languages"].each do |lang| -- @db_hash[lang].spam.open("r") -- @db_hash[lang].spam.export(fh) if (@db_hash[lang].spam.file_count > 0) -+ @options['languages'].each do |lang| -+ @db_hash[lang].spam.open('r') -+ @db_hash[lang].spam.export(fh) if @db_hash[lang].spam.file_count.positive? - @db_hash[lang].spam.close - end - end - end -- if (@options["export-probability"]) -- open_wo(file) do |fh| -- @options["languages"].each do |lang| -- @db_hash[lang].prob.open("r") -- @db_hash[lang].prob.export(fh) if (@db_hash[lang].prob.file_count > 0) -- @db_hash[lang].prob.close -- end -+ return unless (@options['export-probability']) -+ -+ open_wo(file) do |fh| -+ @options['languages'].each do |lang| -+ @db_hash[lang].prob.open('r') -+ @db_hash[lang].prob.export(fh) if @db_hash[lang].prob.file_count.positive? 
-+ @db_hash[lang].prob.close - end - end - end -- -+ - def setup_imap -- Net::IMAP.class_eval < true} -- -+ ['--icon-number', GetoptLong::REQUIRED_ARGUMENT], -+ ['--ssl', GetoptLong::NO_ARGUMENT], -+ ['--ssl-cert', GetoptLong::REQUIRED_ARGUMENT], -+ ['--pop', GetoptLong::NO_ARGUMENT], -+ ['--tasktray', GetoptLong::NO_ARGUMENT], -+ ['--pop-proxy-set', GetoptLong::REQUIRED_ARGUMENT], -+ ['--pop-server', GetoptLong::REQUIRED_ARGUMENT], -+ ['--pop-port', GetoptLong::REQUIRED_ARGUMENT], -+ ['--pop-proxy-if', GetoptLong::REQUIRED_ARGUMENT], -+ ['--pop-proxy-port', GetoptLong::REQUIRED_ARGUMENT], -+ ['--pop-user', GetoptLong::REQUIRED_ARGUMENT], -+ ['--pop-max-size', GetoptLong::REQUIRED_ARGUMENT], -+ ['--imap', GetoptLong::NO_ARGUMENT], -+ ['--imap-server', GetoptLong::REQUIRED_ARGUMENT], -+ ['--imap-port', GetoptLong::REQUIRED_ARGUMENT], -+ ['--imap-auth', GetoptLong::REQUIRED_ARGUMENT], -+ ['--imap-user', GetoptLong::REQUIRED_ARGUMENT], -+ ['--imap-password', GetoptLong::REQUIRED_ARGUMENT], -+ ['--imap-folder-clean', GetoptLong::REQUIRED_ARGUMENT], -+ ['--imap-folder-spam', GetoptLong::REQUIRED_ARGUMENT], -+ ['--imap-fetch-unseen', GetoptLong::NO_ARGUMENT], -+ ['--imap-fetch-unflagged', GetoptLong::NO_ARGUMENT], -+ ['--imap-reset-seen-flag', GetoptLong::NO_ARGUMENT], -+ ['--homedir', GetoptLong::REQUIRED_ARGUMENT], -+ ['--config-file', GetoptLong::REQUIRED_ARGUMENT], -+ ['--pid-file', GetoptLong::REQUIRED_ARGUMENT], -+ ['--db', GetoptLong::REQUIRED_ARGUMENT], -+ ## ["--unified-db", GetoptLong::NO_ARGUMENT], -+ ['--max-line', GetoptLong::REQUIRED_ARGUMENT], -+ ['--export-clean', GetoptLong::NO_ARGUMENT], -+ ['--export-spam', GetoptLong::NO_ARGUMENT], -+ ['--export-probability', GetoptLong::NO_ARGUMENT], -+ ['--import-clean', GetoptLong::NO_ARGUMENT], -+ ['--import-spam', GetoptLong::NO_ARGUMENT], -+ ['--mbox', GetoptLong::NO_ARGUMENT], -+ ['--jtokenizer', '-j', GetoptLong::REQUIRED_ARGUMENT], -+ ['--method', '-m', GetoptLong::REQUIRED_ARGUMENT], -+ ['--spam-cutoff', GetoptLong::REQUIRED_ARGUMENT], -+ ['--mark-in-token', GetoptLong::REQUIRED_ARGUMENT], -+ ['--max-mail', GetoptLong::REQUIRED_ARGUMENT], -+ ['--min-mail', GetoptLong::REQUIRED_ARGUMENT], -+ ['--show-new-token', GetoptLong::NO_ARGUMENT], -+ ['--auto-update', '-a', GetoptLong::NO_ARGUMENT], -+ ['--update', '-u', GetoptLong::NO_ARGUMENT], -+ ['--add-clean', '-c', GetoptLong::NO_ARGUMENT], -+ ['--add-spam', '-s', GetoptLong::NO_ARGUMENT], -+ ['--sub-clean', '-C', GetoptLong::NO_ARGUMENT], -+ ['--sub-spam', '-S', GetoptLong::NO_ARGUMENT], -+ ['--disable-degeneration', '-D', GetoptLong::NO_ARGUMENT], -+ ['--disable-utf-8', GetoptLong::NO_ARGUMENT], -+ ['--ignore-body', '-B', GetoptLong::NO_ARGUMENT], -+ ['--refer-header', GetoptLong::REQUIRED_ARGUMENT], -+ ['--refer-all-header', GetoptLong::NO_ARGUMENT], -+ ['--ignore-header', '-H', GetoptLong::NO_ARGUMENT], -+ ['--ignore-plain-text-part', GetoptLong::NO_ARGUMENT], -+ ['--ignore-after-last-atag', GetoptLong::NO_ARGUMENT], -+ ['--pipe', GetoptLong::NO_ARGUMENT], -+ ['--insert-revision', GetoptLong::NO_ARGUMENT], -+ ['--insert-flag', GetoptLong::NO_ARGUMENT], -+ ['--insert-probability', GetoptLong::NO_ARGUMENT], -+ ['--header-prefix', GetoptLong::REQUIRED_ARGUMENT], -+ ['--mark-spam-subject', GetoptLong::NO_ARGUMENT], -+ ['--spam-subject-prefix', GetoptLong::REQUIRED_ARGUMENT], -+ ['--list-clean', GetoptLong::NO_ARGUMENT], -+ ['--list-spam', GetoptLong::NO_ARGUMENT], -+ ['--show-db-status', GetoptLong::NO_ARGUMENT], -+ ['--show-process', GetoptLong::NO_ARGUMENT], -+ ['--help', '-h', 
GetoptLong::NO_ARGUMENT], -+ ['--revision', GetoptLong::NO_ARGUMENT], -+ ['--quiet', '-q', GetoptLong::NO_ARGUMENT], -+ ['--debug', '-d', GetoptLong::NO_ARGUMENT], -+ ['--verbose', '-v', GetoptLong::NO_ARGUMENT] -+ ) -+ -+ allow_multi = { 'pop-proxy-set' => true } -+ - parser.quiet = true - begin - parser.each_option do |name, arg| - name.sub!(/^--/, '') - if (options[name] && allow_multi[name]) -- options[name] += ("," + arg) -+ options[name] += (',' + arg) - else - options[name] = arg.dup - end - end - rescue -- soft_raise(sprintf("#{$0}: %s", parser.error_message)) -+ soft_raise(format("#{$PROGRAM_NAME}: %s", parser.error_message)) - end - return options - end -- -- -+ - def get_options -- argv_backup = Marshal::load(Marshal::dump(ARGV)) # shallow copy is enough? -+ argv_backup = Marshal.load(Marshal.dump(ARGV)) # shallow copy is enough? - options = parse_command_line -- -- if (options["config-file"] && (! File::file?(options["config-file"]))) -- soft_raise(sprintf("#{$0}: can't open config file `%s'. check argument of --config-file\n", options["config-file"])) -+ -+ if (options['config-file'] && !File.file?(options['config-file'])) -+ soft_raise(format("#{$PROGRAM_NAME}: can't open config file `%s'. check argument of --config-file\n", -+ options['config-file'])) - end -- -- if (! options["homedir"]) -- if (ENV["BSFILTERHOME"]) -- options["homedir"] = ENV["BSFILTERHOME"] -- elsif (ENV["HOME"]) -- options["homedir"] = ENV["HOME"] + "/" + Default_homedir -- elsif (defined?(ExerbRuntime)) -- options["homedir"] = File.dirname(ExerbRuntime.filepath) -- else -- options["homedir"] = File.dirname($0) -- end -+ -+ unless (options['homedir']) -+ options['homedir'] = if (ENV['BSFILTERHOME']) -+ ENV['BSFILTERHOME'] -+ elsif (ENV['HOME']) -+ ENV['HOME'] + '/' + Default_homedir -+ elsif defined?(ExerbRuntime) -+ File.dirname(ExerbRuntime.filepath) -+ else -+ File.dirname($PROGRAM_NAME) -+ end - end -- -- if (! options["config-file"]) -- options["config-file"] = options["homedir"] + "/" + Default_conf_file -- end -- if (options["config-file"] && File::file?(options["config-file"])) -+ -+ options['config-file'] = options['homedir'] + '/' + Default_conf_file unless (options['config-file']) -+ if (options['config-file'] && File.file?(options['config-file'])) - ARGV.clear -- argv_config = read_config_file(options["config-file"]) -+ argv_config = read_config_file(options['config-file']) - (argv_config + argv_backup).reverse.each do |argv| - ARGV.unshift(argv) - end - options.update(parse_command_line) - end -- -- if (options["help"]) -+ -+ if (options['help']) - usage - exit 0 - end -- if (options["revision"]) -- print "bsfilter release #{Release} revision #{Revision}\n" -+ if (options['revision']) -+ print("bsfilter release #{Release} revision #{Revision}\n") - exit 0 - end -- -- options["homedir"] = options["homedir"].sub(/\/*$/, '') + "/" -- -- if (options["method"]) -- if (options["method"] !~ /\A(g|r|rf)\z/) -- soft_raise(sprintf("#{$0}: unsupported method `%s' for --method or -m\n", options["method"])) -+ -+ options['homedir'] = options['homedir'].sub(%r{/*$}, '') + '/' -+ -+ if (options['method']) -+ if (options['method'] !~ /\A(g|r|rf)\z/) -+ soft_raise(format("#{$PROGRAM_NAME}: unsupported method `%s' for --method or -m\n", options['method'])) - end - else -- options["method"] = Default_method -+ options['method'] = Default_method - end -- -- options["header-prefix"] = Default_header_prefix if (! options["header-prefix"]) -- options["spam-subject-prefix"] = Default_spam_subject_prefix if (! 
options["spam-subject-prefix"]) -- -- options["db"] = Default_db if (! options["db"]) -- case options["db"] -- when "sdbm" -+ -+ options['header-prefix'] = Default_header_prefix unless (options['header-prefix']) -+ options['spam-subject-prefix'] = Default_spam_subject_prefix unless (options['spam-subject-prefix']) -+ -+ options['db'] = Default_db unless (options['db']) -+ case options['db'] -+ when 'ndbm' -+ require 'dbm' -+ when 'sdbm' - require 'sdbm' -- when "gdbm" -+ when 'gdbm' - require 'gdbm' -- when "bdb1" -+ when 'bdb1' - require 'bdb1' -- when "bdb" -+ when 'bdb' - require 'bdb' -- when "qdbm" -+ when 'qdbm' - require 'depot' - else -- soft_raise(sprintf("#{$0}: unsupported argument `%s' for --db\n", options["db"])) -+ soft_raise(format("#{$PROGRAM_NAME}: unsupported argument `%s' for --db\n", options['db'])) - end -- -- if (options["jtokenizer"]) -- options["jtokenizer"].downcase! -+ -+ if (options['jtokenizer']) -+ options['jtokenizer'].downcase! - else -- options["jtokenizer"] = Default_jtokenizer -+ options['jtokenizer'] = Default_jtokenizer - end -- case options["jtokenizer"] -- when "bigram" -- when "block" -- when "mecab" -+ case options['jtokenizer'] -+ when 'bigram' -+ when 'block' -+ when 'mecab' - require 'MeCab' -- when "chasen" -+ when 'chasen' - require 'chasen.o' -- when "kakasi" -+ when 'kakasi' - require 'kakasi' - else -- soft_raise(sprintf("#{$0}: unsupported argument `%s' for --jtokenizer or -j\n", options["jtokenizer"])) -+ soft_raise(format("#{$PROGRAM_NAME}: unsupported argument `%s' for --jtokenizer or -j\n", options['jtokenizer'])) - end -- @jtokenizer = Jtokenizer::new(options["jtokenizer"]) -+ @jtokenizer = Jtokenizer.new(options['jtokenizer']) - --## if (options["unified-db"]) --## options["languages"] = [Default_Language] --## else --## options["languages"] = Languages --## end -+ ## if (options["unified-db"]) -+ ## options["languages"] = [Default_Language] -+ ## else -+ ## options["languages"] = Languages -+ ## end - -- options["languages"] = Languages -+ options['languages'] = Languages - -- options['mark-in-token'] = Default_mark_in_token if (! options['mark-in-token']) -- options['mark-in-token'].gsub!(/\s/, '') -- options["max-line"] = (options["max-line"] || Default_max_line).to_i -- options["max-mail"] = (options["max-mail"] || Default_max_mail).to_i -- options["min-mail"] = (options["min-mail"] || Default_min_mail).to_i -- -- options["degeneration"] = options["disable-degeneration"] ? false : true -+ options['mark-in-token'] = Default_mark_in_token unless (options['mark-in-token']) -+ options['mark-in-token'] = options['mark-in-token'].gsub(/\s/, '') -+ options['max-line'] = (options['max-line'] || Default_max_line).to_i -+ options['max-mail'] = (options['max-mail'] || Default_max_mail).to_i -+ options['min-mail'] = (options['min-mail'] || Default_min_mail).to_i - -- if (options["refer-header"]) -- array = options["refer-header"].downcase.split(',') -- elsif (options["ignore-header"]) -- array = Array::new -- else -- array = Default_refer_header.downcase.split(',') -- end -- options["refer-header"] = Hash::new -+ options['degeneration'] = options['disable-degeneration'] ? 
false : true -+ -+ array = if (options['refer-header']) -+ options['refer-header'].downcase.split(',') -+ elsif (options['ignore-header']) -+ [] -+ else -+ Default_refer_header.downcase.split(',') -+ end -+ options['refer-header'] = {} - array.each do |header| -- options["refer-header"][header] = true -+ options['refer-header'][header] = true - end - -- options["use-body"] = options["ignore-body"] ? false : true -- -- options["pid-file"] = options["homedir"] + Default_pid_file if (! options["pid-file"]) -- -- options["imap-auth"] = options["imap-auth"] || Default_imap_auth -- options["imap-auth-preference"] = Default_imap_auth_preference # can't modify with command line option -- -- if ((! options["disable-utf-8"])) -- options["utf-8"] = true -- else -- options["utf-8"] = false -- end -- -- if (options["pop"]) -+ options['use-body'] = options['ignore-body'] ? false : true -+ -+ options['pid-file'] = options['homedir'] + Default_pid_file unless (options['pid-file']) -+ -+ options['imap-auth'] = options['imap-auth'] || Default_imap_auth -+ options['imap-auth-preference'] = Default_imap_auth_preference # can't modify with command line option -+ -+ options['utf-8'] = if ((!options['disable-utf-8'])) -+ true -+ else -+ false -+ end -+ -+ if (options['pop']) - check_options_for_pop!(options) -- require 'timeout' -+ require 'timeout' - require 'socket' - setup_socket_timeout - end -- if (options["imap"]) -+ if (options['imap']) - check_options_for_imap!(options) - require 'net/imap' - setup_imap - end -- if (options["ssl"]) -- if (options["ssl-cert"]) -- if (! File::readable?(options["ssl-cert"])) -- soft_raise(sprintf("#{$0}: can't read %s. check --ssl-cert option", options["ssl-cert"])) -- end -+ if (options['ssl']) -+ if (options['ssl-cert']) && !File.readable?(options['ssl-cert']) -+ soft_raise(format("#{$PROGRAM_NAME}: can't read %s. 
check --ssl-cert option", options['ssl-cert'])) - end -- require "openssl" -+ require 'openssl' - setup_ssl_socket_timeout - end - return options - end -- -+ - def show_db_status -- @options["languages"].each do |lang| -- @db_hash[lang].clean.open("r") -- @db_hash[lang].spam.open("r") -- @db_hash[lang].prob.open("r") -- @options["message-fh"].printf("db %s %d %d %d %d %d\n", lang, -- @db_hash[lang].clean.size, -- @db_hash[lang].clean.file_count, -- @db_hash[lang].spam.size, -- @db_hash[lang].spam.file_count, -- @db_hash[lang].prob.size) -+ @options['languages'].each do |lang| -+ @db_hash[lang].clean.open('r') -+ @db_hash[lang].spam.open('r') -+ @db_hash[lang].prob.open('r') -+ @options['message-fh'].printf("db %s %d %d %d %d %d\n", lang, -+ @db_hash[lang].clean.size, -+ @db_hash[lang].clean.file_count, -+ @db_hash[lang].spam.size, -+ @db_hash[lang].spam.file_count, -+ @db_hash[lang].prob.size) - @db_hash[lang].prob.close - @db_hash[lang].spam.close - @db_hash[lang].clean.close - end - end -- -+ - def show_process(token_db, maintenance_command) -- if (@options["pop"]) -- prot = "pop" -- elsif (@options["imap"]) -+ if (@options['pop']) -+ prot = 'pop' -+ elsif (@options['imap']) - prot = "imap" - else - prot = "file" - end -- -+ - case token_db.spam_flag - when nil -- filter_result = "-" -+ filter_result = '-' - when true -- filter_result = "spam" -+ filter_result = 'spam' - when false -- filter_result = "clean" -+ filter_result = 'clean' - else -- raise "internal error: unknown spam_flag" -+ raise 'internal error: unknown spam_flag' - end -- -- @options["message-fh"].printf("%s %s %s %s %s %s %s\n", -- prot, -- token_db.language, -- filter_result, -- maintenance_command, -- token_db.time.strftime("%Y%m%d%H%M%S"), -- token_db.message_id, -- token_db.filename) -+ -+ @options['message-fh'].printf("%s %s %s %s %s %s %s\n", -+ prot, -+ token_db.language, -+ filter_result, -+ maintenance_command, -+ token_db.time.strftime('%Y%m%d%H%M%S'), -+ token_db.message_id, -+ token_db.filename) - end - - def spam? -@@ -3326,7 +3310,7 @@ EOM - def probability - @token_dbs.last.probability - end -- -+ - def setup(command_line_options) - @options.clear - @db_hash.clear -@@ -3334,163 +3318,166 @@ EOM - command_line_options_backup = command_line_options.dup - argv_backup = ARGV.dup - ARGV.clear -- if (! command_line_options_backup.empty?) -- ARGV.unshift(*command_line_options_backup) -- end -+ ARGV.unshift(*command_line_options_backup) unless command_line_options_backup.empty? - - @options.update(get_options) - -- STDIN::binmode -- if (@options["quiet"]) -- @options["message-fh"] = DevNull::new -- @options["pipe-fh"] = DevNull::new -- elsif (((@options["export-clean"] || @options["export-spam"] || @options["export-probability"]) && -- ((ARGV.length == 0) || (ARGV[0] == "-"))) || # export to stdout -- @options["list-clean"] || @options["list-spam"] || @options["pipe"]) -- @options["message-fh"] = STDERR -- @options["pipe-fh"] = STDOUT -- STDOUT::binmode -+ $stdin.binmode -+ if (@options['quiet']) -+ @options['message-fh'] = DevNull.new -+ @options['pipe-fh'] = DevNull.new -+ elsif (((@options['export-clean'] || @options['export-spam'] || @options['export-probability']) && -+ (ARGV.empty? 
|| (ARGV[0] == '-'))) || # export to stdout -+ @options['list-clean'] || @options['list-spam'] || @options['pipe']) -+ @options['message-fh'] = $stderr -+ @options['pipe-fh'] = $stdout -+ $stdout.binmode - else -- @options["message-fh"] = STDOUT -- @options["pipe-fh"] = STDOUT -+ @options['message-fh'] = $stdout -+ @options['pipe-fh'] = $stdout - # keep STDOUT in text mode -- @options["message-fh"].sync = true -+ @options['message-fh'].sync = true - end -- -- @options['mark-in-token'] = Regexp::quote(@options['mark-in-token']) -- -- init_dir(@options["homedir"]) -- -- @options["languages"].each do |lang| -- case @options["method"] -+ -+ @options['mark-in-token'] = Regexp.quote(@options['mark-in-token']) -+ -+ init_dir(@options['homedir']) -+ -+ @options['languages'].each do |lang| -+ case @options['method'] - when 'rf' -- @db_hash[lang] = RobinsonFisher::new(@options, lang) -+ @db_hash[lang] = RobinsonFisher.new(@options, lang) - when 'r' -- @db_hash[lang] = Robinson::new(@options, lang) -+ @db_hash[lang] = Robinson.new(@options, lang) - when 'g' -- @db_hash[lang] = Graham::new(@options, lang) -+ @db_hash[lang] = Graham.new(@options, lang) - else -- raise sprintf("internal error: unknown method %s", @options["method"]) -+ raise format('internal error: unknown method %s', @options['method']) - end -- @db_hash[lang].spam_cutoff = @options["spam-cutoff"].to_f if (@options["spam-cutoff"]) -+ @db_hash[lang].spam_cutoff = @options['spam-cutoff'].to_f if (@options['spam-cutoff']) - end - - rest_options = ARGV.dup - ARGV.clear -- if (! argv_backup.empty?) -- ARGV.unshift(*argv_backup) -- end -+ ARGV.unshift(*argv_backup) unless argv_backup.empty? - - return rest_options - end - - def run(command_line_args) -- @options["message-fh"].print "start ", Time::new.to_s, "\n" if (@options["verbose"]) -- if (@options["show-db-status"]) -+ @options['message-fh'].print('start ', Time.new.to_s, "\n") if (@options['verbose']) -+ if (@options['show-db-status']) - show_db_status - return EXIT_NORMAL - end -- -- if (@options["pop"]) -- write_pid_file(@options["pid-file"]) -+ -+ if (@options['pop']) -+ write_pid_file(@options['pid-file']) - do_pop -- File::unlink(@options["pid-file"]) -+ File.unlink(@options['pid-file']) - return EXIT_NORMAL - end -- -+ - filtering_mode = true -- -- token_dbs = Array::new -+ -+ token_dbs = [] - @token_dbs = token_dbs -- if (@options["import-clean"] || -- @options["import-spam"] || -- @options["add-clean"] || -- @options["add-spam"] || -- @options["sub-clean"] || -- @options["sub-spam"]) -+ if (@options['import-clean'] || -+ @options['import-spam'] || -+ @options['add-clean'] || -+ @options['add-spam'] || -+ @options['sub-clean'] || -+ @options['sub-spam']) - filtering_mode = false -- if (command_line_args.empty? && ! @options["imap"]) -- token_dbs = update_token_dbs(["-"]) -+ if (command_line_args.empty? && ! 
@options['imap']) -+ token_dbs = update_token_dbs(['-']) - else - token_dbs = update_token_dbs(command_line_args) - end - end -- -- if (@options["export-clean"] || @options["export-spam"] || @options["export-probability"]) -+ -+ if (@options['export-clean'] || @options['export-spam'] || @options['export-probability']) - filtering_mode = false - do_export(command_line_args) - end -- -- if (@options["update"]) -+ -+ if (@options['update']) - filtering_mode = false -- @options["languages"].each do |lang| -- @db_hash[lang].clean.open("r") -- @db_hash[lang].spam.open("r") -+ @options['languages'].each do |lang| -+ @db_hash[lang].clean.open('r') -+ @db_hash[lang].spam.open('r') - @db_hash[lang].update_probability(token_dbs) # dbs = Array of TokenDB for -c, -s - @db_hash[lang].clean.close - @db_hash[lang].spam.close - end - end -- -+ - ret_code = CODE_NORMAL -- if (filtering_mode) -- @options["languages"].each do |lang| -- @db_hash[lang].prob.open("r") -+ if filtering_mode -+ @options['languages'].each do |lang| -+ @db_hash[lang].prob.open('r') - end -- if (@options["imap"]) -+ if (@options['imap']) - ret_code = do_imap(command_line_args, token_dbs) - else -- if (command_line_args.empty?) -- command_line_args = ["-"] -- end -- ret_code = CODE_CLEAN if (! @options["pipe"]) -+ command_line_args = ['-'] if command_line_args.empty? -+ ret_code = CODE_CLEAN unless (@options['pipe']) - command_line_args.each do |file| - open_ro(file) do |fh| - number = 1 -- mbox = Mbox::new(@options, fh) -+ mbox = Mbox.new(@options, fh) - while (buf = mbox.read) - token_db = tokenize_buf(buf) - token_db.filename = file - @db_hash[token_db.language].get_combined_probability(token_db) -- insert_headers!(buf, token_db.spam_flag, token_db.probability) -- if (@options["pipe"]) -- @options["pipe-fh"].print buf.join -- end -- printf("%s\n", file) if (token_db.spam_flag && @options["list-spam"]) -- printf("%s\n", file) if (! token_db.spam_flag && @options["list-clean"]) -- ret_code = CODE_SPAM if (token_db.spam_flag && (! 
@options["pipe"])) -+ insert_headers!(buf, token_db.spam_flag, token_db.probability) -+ @options['pipe-fh'].print buf.join if (@options['pipe']) -+ printf("%s\n", file) if (token_db.spam_flag && @options['list-spam']) -+ printf("%s\n", file) if (!token_db.spam_flag && @options['list-clean']) -+ ret_code = CODE_SPAM if (token_db.spam_flag && (!@options['pipe'])) - token_dbs.push(token_db) -- if (defined?(fh.path)) -- @options["message-fh"].printf("combined probability %s %d %f\n", -- fh.path, number, token_db.probability) -+ if defined?(fh.path) -+ @options['message-fh'].printf("combined probability %s %d %f\n", -+ fh.path, number, token_db.probability) - end - number += 1 - end - end - end - end -- @options["languages"].each do |lang| -+ @options['languages'].each do |lang| - @db_hash[lang].prob.close - end -- STDOUT::flush -- if (@options["auto-update"]) -- auto_update(token_dbs) -- elsif (@options["show-process"]) -+ $stdout.flush -+ if (@options['auto-update']) -+ auto_update(token_dbs) -+ elsif (@options['show-process']) - token_dbs.each do |token_db| -- show_process(token_db, "-") -+ show_process(token_db, '-') - end - end - end -- @options["message-fh"].print "end ", Time::new.to_s, "\n" if (@options["verbose"]) -- -+ @options['message-fh'].print('end ', Time.new.to_s, "\n") if (@options['verbose']) -+ - return ret_code - end - end - --if ($0 == __FILE__) -- bsfilter = Bsfilter::new -+class String -+ def to_utf8 -+ if (Bsfilter::LOG_CODESET) -+ return dup.encode(Bsfilter::LOG_CODESET, Encoding::EUC_JP, undef: :replace, invalid: :replace) -+ else -+ self -+ end -+ end -+end -+ -+ -+if ($PROGRAM_NAME == __FILE__) -+ bsfilter = Bsfilter.new - args = bsfilter.setup(ARGV) -- if (bsfilter.run(args)) -+ if bsfilter.run(args) - exit 0 - else - exit 1