From e14053143acb53b5ee953f405df56c7164e915a7 Mon Sep 17 00:00:00 2001 From: Masahiko Ito Date: Mon, 9 May 2016 00:09:20 +0900 Subject: [PATCH] 2016/05/09 00:09:20 --- README | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++ sf_add.sh | 128 ++++++++++++++++++++++++++++++++++++++++++ sf_check.sh | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++ sf_del.sh | 133 +++++++++++++++++++++++++++++++++++++++++++ sf_init.sh | 54 ++++++++++++++++++ 5 files changed, 627 insertions(+) create mode 100644 README create mode 100755 sf_add.sh create mode 100755 sf_check.sh create mode 100755 sf_del.sh create mode 100755 sf_init.sh diff --git a/README b/README new file mode 100644 index 0000000..ce08e01 --- /dev/null +++ b/README @@ -0,0 +1,158 @@ + + sf-0.1 -- spam filter for UNIX-like systems + +Copyright (C) 2006 Masahiko Ito + +These programs are free software; you can redistribute them and/or modify them under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 2 of the License, or (at your option) any later +version. + +These programs are distributed in the hope that they will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +these programs; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +Mail suggestions and bug reports for these programs to +"Masahiko Ito" + +History +======= + + ・ 2006/10/10 Ver. 0.1 release (1st) + +What's this ? 
+============= + +ベイズ理論を理解しづらいと感じたお馬鹿な私が、もっとシンプルな考え方でも効果が +見込めるのでは?と思い、自分なりの仮説に基づいてフィルタを実装してみたら、結構 +いけてしまったという代物。 + +Preinstall +========== + +sf-0.1は、以下のソフトウェアを利用しています。事前にインストールが必要です。 + + ・ KAKASI - 漢字→かな(ローマ字)変換プログラム (http://kakasi.namazu.org/) + ・ SQLite home page (http://www.sqlite.org/) + +Install +======= + + ・ tar xvzf sf-0.1.tar.gz + ・ cd sf-0.1 + ・ cp sf_*.sh /anywhere/bin/ + ・ mkdir ~/.sf + +Algorithm +========= + +非spam学習テーブル(以下whiteテーブル)は非spamメール中の「単語」と「出現回数」を +保持し、 spam学習テーブル(以下blackテーブル)はspamメール中の「単語」と「出現回 +数」を保持します。 + +create table t_white ( + term text primary key, + count long long int +); +create table t_black ( + term text primary key, + count long long int +); + +あるメールに関して検査を行う場合、まずそのメール本文を「単語(tango1〜n)」に分解 +し、それぞれの「出現回数(count1〜n)」をカウントします。 + +「単語(tango1)」を検索キーにしてwhiteテーブルを検索し「whiteテーブル中の出現回 +数」x「出現回数(count1)」/「whiteテーブル中の全単語の出現回数の総和」を求めます +(white_score)。 + +「単語(tango1)」を検索キーにしてblackテーブルを検索し「blackテーブル中の出現回 +数」x「出現回数(count1)」/「blackテーブル中の全単語の出現回数の総和」を求めます +(black_score)。 + +(white_score / (white_score + black_score)) - 0.5 を求め、これを「単語(tango1) +」に対する「スコア」とする。「スコア」は-0.5〜+0.5の値を取り、マイナス値はspam +傾向が高く、プラス値は非spam傾向が高い単語であることを意味する。 + +以下、同様に残りの「単語」についても「スコア」を求め、「単語(tango1〜n)」の全「 +スコア」を合計し、この合計値がマイナス値ならばspamと判断する。 + +How to use +========== + +sf_init.sh +---------- + +$ sf_init.sh -h +Usage : sf_init.sh +Initialize database. + +spamの判断に利用するデータベースを初期化します。システムを利用する一番最初に一 +度だけ実行します。 + +sf_add.sh +--------- + +$ sf_add.sh -h +Usage : sf_add.sh [-w|--white|-b|--black] [-v|--vacuum] [file ...] +Add data to database. + -w, --white add data to white database. + -b, --black add data to black database. + -v, --vacuum vacuum after add. + +データベースの学習(追加)を行います。日々やって来るspamのうち、正しくspamと判断 +出来なかった物を-bオプションで学習させます。また、非spamのうち、誤って spamと判 +断された物を-wオプションで学習させます。システムの利用前に、spam、非spam をそれ +ぞれ100メール程度学習させておけば、90%以上(?)の精度で振り分け出来ます。 + +sf_del.sh +--------- + +$ sf_del.sh -h +Usage : sf_del.sh [-w|--white|-b|--black] [-v|--vacuum] [file ...] +Del data from database. 
+ -w, --white delete data from white database.
+ -b, --black delete data from black database.
+ -v, --vacuum vacuum after del.
+
+データベースの学習(削除)を行います。学習ミスを取り消す場合に利用します。
+
+sf_check.sh
+-----------
+
+$ sf_check.sh -h
+Usage : sf_check.sh [-w|--white|-b|--black] [file ...]
+Check file.
+ -w, --white check white?
+ -b, --black check black?
+return 0 when check is true.
+return 1 when check is false.
+
+入力ファイル(またはstdin)の内容を検査し、検査スコア(実数値)をstdoutに出力後、検
+査結果が真なら0、偽なら1を返します。検査スコアは、spamの場合マイナス値となり、
+spamでない場合は0.0以上の値となります。無学習の場合の検査スコアは常に 0.0以上と
+なります。
+
+procmailとの連係
+================
+
+実際のspam振り分けに関してはprocmailとの連係が良いと思います。
+
+$ cat ~/.procmailrc
+:0 HB
+* ? sf_check.sh -b
+/home/ほげ/Mail/spam/.
+
+sf_check.shによりspam(-b)である事を検査し、真であれば/home/ほげ/Mail/spam/にメ
+ールを格納します。
+
+BUGS
+====
+
+ ・ スクリプトに渡すデータに複数の文字コードが混在している場合は正しい判断が出
+ 来ないかもしれません。
+
diff --git a/sf_add.sh b/sf_add.sh
new file mode 100755
index 0000000..f46954c
--- /dev/null
+++ b/sf_add.sh
@@ -0,0 +1,128 @@
+#! /bin/sh
+#
+# spam filter programs by m-ito@myh.no-ip.org
+#
+#----------------------------------------------------------------------
+#
+# functions
+# show_help: print the usage text on stdout (POSIX form, no "function").
+show_help () {
+    echo "Usage : $0 [-w|--white|-b|--black] [-v|--vacuum] [file ...]"
+    echo "Add data to database."
+    echo " -w, --white add data to white database."
+    echo " -b, --black add data to black database."
+    echo " -v, --vacuum vacuum after add."
+}
+#----------------------------------------------------------------------
+#
+# main routine: count the words of the input and add them to ${table}.
+#
+if [ "X$1" = "X-h" ] || [ "X$1" = "X--help" ]
+then
+    show_help
+    exit 0
+fi
+# Database location: ${SFDIR}/${SFDB}, defaulting to ${HOME}/.sf/sf.db.
+if [ "X${SFDIR}" = "X" ]
+then
+    SFDIR=${HOME}/.sf
+fi
+#
+if [ "X${SFDB}" = "X" ]
+then
+    SFDB=sf.db
+fi
+#
+SFDB_PATH=${SFDIR}/${SFDB}
+# printf, not "echo -n -e": echo options are not portable across /bin/sh.
+maxlength=50; export maxlength
+tab=`printf '\t'`
+zsp=`printf '\241\241'`
+# Work files go to the private ${SFDIR} instead of world-writable /tmp.
+tmp1=${SFDIR}/sf_add.1.$$.tmp; tmp2=${SFDIR}/sf_add.2.$$.tmp
+trap 'rm -f "${tmp1}" "${tmp2}"' 0
+table=""; file=""; vacuum=""
+set -f  # mail words must never be expanded as filename patterns
+while [ $# != 0 ]
+do
+    case $1 in
+    -w|--white )
+        table="t_white"
+        ;;
+    -b|--black )
+        table="t_black"
+        ;;
+    -v|--vacuum )
+        vacuum="vacuum;"
+        ;;
+    * )
+        file="${file} $1"
+        ;;
+    esac
+    shift
+done
+#
+if [ "X${table}" = "X" ]
+then
+    show_help
+    exit 0
+fi
+# Tokenize: one word per line, words longer than ${maxlength} are dropped.
+for i in `cat ${file} |\
+    nkf -e -X |\
+    kakasi -w -ieuc -oeuc |\
+    sed -e "s/${zsp}/ /g;s/${tab}/ /g" |\
+    awk '{gsub(/ /,"\n");print}' |\
+    awk 'BEGIN{ \
+        maxlength = ENVIRON["maxlength"]; \
+    } \
+    { \
+        if (length($0) <= maxlength){ \
+            print; \
+        } \
+    }' |\
+    tr -d '"'`
+do
+    printf '%s' "$i" | tr -d '[:cntrl:]'
+    echo ""
+done >"${tmp1}"
+#
+echo "begin;" >"${tmp2}"
+# "uniq -c" puts a space (POSIX) or a tab (old GNU) after the count.
+for i in `cat "${tmp1}" |\
+    sort |\
+    uniq -c |\
+    sed -e "s/^ *//;s/[ ${tab}]/,/"`
+do
+    count=`printf '%s\n' "$i" | cut -d, -f1`
+    term=`printf '%s\n' "$i" | cut -s -d, -f2- | sed -e "s/'/''/g"`
+    if [ "X${term}" = "X" ]
+    then
+        : # do nothing
+    else
+        result=`printf '%s\n' "select term from ${table} where term='${term}';" |\
+            sqlite3 "${SFDB_PATH}"`
+        if [ "X${result}" = "X" ]
+        then
+            printf '%s\n' "insert into ${table} values ('${term}',${count});" >>"${tmp2}"
+        else
+            printf '%s\n' "update ${table} set count=count+${count} where term='${term}';" >>"${tmp2}"
+        fi
+    fi
+done
+#
+echo "end;" >>"${tmp2}"
+cat "${tmp2}" |\
+sqlite3 "${SFDB_PATH}"
+# sum() is NULL on an empty table; coalesce keeps the update below valid.
+result=`echo "select coalesce(sum(count),0) from ${table};" |\
+    sqlite3 "${SFDB_PATH}"`
+echo "update t_total set count=${result} where tablenm='${table}';" |\
+sqlite3 "${SFDB_PATH}"
+#
+echo ${vacuum} |\
+sqlite3 "${SFDB_PATH}"
+#
+rm -f "${tmp1}" "${tmp2}"
+#
+exit 0
diff --git a/sf_check.sh b/sf_check.sh
new file mode 100755
index 0000000..839c21b
--- /dev/null
+++ b/sf_check.sh
@@ -0,0 +1,154 @@
+#! /bin/sh
+#
+# spam filter programs by m-ito@myh.no-ip.org
+#
+#----------------------------------------------------------------------
+#
+# functions
+# show_help: print the usage text on stdout (POSIX form, no "function").
+show_help () {
+    echo "Usage : $0 [-w|--white|-b|--black] [file ...]"
+    echo "Check file."
+    echo " -w, --white check white?"
+    echo " -b, --black check black?"
+    echo "return 0 when check is true."
+    echo "return 1 when check is false."
+}
+#----------------------------------------------------------------------
+#
+# main routine: print the total score of the input, exit 0 if check is true.
+#
+if [ "X$1" = "X-h" ] || [ "X$1" = "X--help" ]
+then
+    show_help
+    exit 0
+fi
+# Database location: ${SFDIR}/${SFDB}, defaulting to ${HOME}/.sf/sf.db.
+if [ "X${SFDIR}" = "X" ]
+then
+    SFDIR=${HOME}/.sf
+fi
+#
+if [ "X${SFDB}" = "X" ]
+then
+    SFDB=sf.db
+fi
+#
+SFDB_PATH=${SFDIR}/${SFDB}
+# printf, not "echo -n -e": echo options are not portable across /bin/sh.
+maxlength=50; export maxlength
+tab=`printf '\t'`
+zsp=`printf '\241\241'`
+# The work file goes to the private ${SFDIR} instead of world-writable /tmp.
+tmp1=${SFDIR}/sf_check.1.$$.tmp; trap 'rm -f "${tmp1}"' 0
+table=""; file=""
+set -f  # mail words must never be expanded as filename patterns
+while [ $# != 0 ]
+do
+    case $1 in
+    -w|--white )
+        table="t_white"
+        ;;
+    -b|--black )
+        table="t_black"
+        ;;
+    * )
+        file="${file} $1"
+        ;;
+    esac
+    shift
+done
+# A usage error must not exit 0: callers read 0 as "check is true".
+if [ "X${table}" = "X" ]
+then
+    show_help
+    exit 2
+fi
+# Table totals. An empty or 0 total becomes 1 so bc never divides by zero;
+# an untrained table has no rows, so its per-word counts are 0 anyway.
+w_total=`echo "select count from t_total where tablenm='t_white';" | sqlite3 "${SFDB_PATH}"`
+b_total=`echo "select count from t_total where tablenm='t_black';" | sqlite3 "${SFDB_PATH}"`
+case "${w_total}" in ''|0) w_total=1 ;; esac
+case "${b_total}" in ''|0) b_total=1 ;; esac
+total_score=0.0
+for i in `cat ${file} |\
+    nkf -e -X |\
+    kakasi -w -ieuc -oeuc |\
+    sed -e "s/${zsp}/ /g;s/${tab}/ /g" |\
+    awk '{gsub(/ /,"\n");print}' |\
+    awk 'BEGIN{ \
+        maxlength = ENVIRON["maxlength"]; \
+    } \
+    { \
+        if (length($0) <= maxlength){ \
+            print; \
+        } \
+    }' |\
+    tr -d '"'`
+do
+    printf '%s' "$i" | tr -d '[:cntrl:]'
+    echo ""
+done >"${tmp1}"
+# "uniq -c" puts a space (POSIX) or a tab (old GNU) after the count.
+for i in `cat "${tmp1}" |\
+    sort |\
+    uniq -c |\
+    sed -e "s/^ *//;s/[ ${tab}]/,/"`
+do
+    count=`printf '%s\n' "$i" | cut -d, -f1`
+    term=`printf '%s\n' "$i" | cut -s -d, -f2- | sed -e "s/'/''/g"`
+#
+    if [ "X${term}" = "X" ]
+    then
+        : # do nothing
+    else
+        w_count=`printf '%s\n' "select count*${count} from t_white where term='${term}';" |\
+            sqlite3 "${SFDB_PATH}"`
+        if [ "X${w_count}" = "X" ]
+        then
+            w_count=0
+        fi
+#
+        b_count=`printf '%s\n' "select count*${count} from t_black where term='${term}';" |\
+            sqlite3 "${SFDB_PATH}"`
+        if [ "X${b_count}" = "X" ]
+        then
+            b_count=0
+        fi
+# Words unknown to both tables do not contribute to the score.
+        if [ "${w_count}" = "0" ] && [ "${b_count}" = "0" ]
+        then
+            : # do nothing
+        else
+            score=`echo "scale=10;((${w_count}/${w_total}) / ((${b_count} / ${b_total}) + (${w_count} / ${w_total}))) - 0.5" | bc`
+            total_score=`echo "scale=10;${total_score} + ${score}" | bc`
+        fi
+    fi
+done
+#
+rm -f "${tmp1}"
+#
+echo "${total_score}"
+# -w is true for a score >= 0, -b is true for a negative score.
+if [ "${table}" = "t_white" ]
+then
+    echo "${total_score}" |\
+    awk '{
+        if ($0 >= 0){
+            exit 0;
+        }else{
+            exit 1;
+        }
+    }'
+else
+    echo "${total_score}" |\
+    awk '{
+        if ($0 >= 0){
+            exit 1;
+        }else{
+            exit 0;
+        }
+    }'
+fi
+#
+exit $?
diff --git a/sf_del.sh b/sf_del.sh
new file mode 100755
index 0000000..dce9647
--- /dev/null
+++ b/sf_del.sh
@@ -0,0 +1,133 @@
+#! /bin/sh
+#
+# spam filter programs by m-ito@myh.no-ip.org
+#
+#----------------------------------------------------------------------
+#
+# functions
+# show_help: print the usage text on stdout (POSIX form, no "function").
+show_help () {
+    echo "Usage : $0 [-w|--white|-b|--black] [-v|--vacuum] [file ...]"
+    echo "Del data from database."
+    echo " -w, --white delete data from white database."
+    echo " -b, --black delete data from black database."
+    echo " -v, --vacuum vacuum after del."
+}
+#----------------------------------------------------------------------
+#
+# main routine: count the words of the input and subtract them from ${table}.
+#
+if [ "X$1" = "X-h" ] || [ "X$1" = "X--help" ]
+then
+    show_help
+    exit 0
+fi
+# Database location: ${SFDIR}/${SFDB}, defaulting to ${HOME}/.sf/sf.db.
+if [ "X${SFDIR}" = "X" ]
+then
+    SFDIR=${HOME}/.sf
+fi
+#
+if [ "X${SFDB}" = "X" ]
+then
+    SFDB=sf.db
+fi
+#
+SFDB_PATH=${SFDIR}/${SFDB}
+# printf, not "echo -n -e": echo options are not portable across /bin/sh.
+maxlength=50; export maxlength
+tab=`printf '\t'`
+zsp=`printf '\241\241'`
+# Work files go to the private ${SFDIR} instead of world-writable /tmp.
+tmp1=${SFDIR}/sf_del.1.$$.tmp; tmp2=${SFDIR}/sf_del.2.$$.tmp
+trap 'rm -f "${tmp1}" "${tmp2}"' 0
+table=""; file=""; vacuum=""
+set -f  # mail words must never be expanded as filename patterns
+while [ $# != 0 ]
+do
+    case $1 in
+    -w|--white )
+        table="t_white"
+        ;;
+    -b|--black )
+        table="t_black"
+        ;;
+    -v|--vacuum )
+        vacuum="vacuum;"
+        ;;
+    * )
+        file="${file} $1"
+        ;;
+    esac
+    shift
+done
+#
+if [ "X${table}" = "X" ]
+then
+    show_help
+    exit 0
+fi
+# Tokenize: one word per line, words longer than ${maxlength} are dropped.
+for i in `cat ${file} |\
+    nkf -e -X |\
+    kakasi -w -ieuc -oeuc |\
+    sed -e "s/${zsp}/ /g;s/${tab}/ /g" |\
+    awk '{gsub(/ /,"\n");print}' |\
+    awk 'BEGIN{ \
+        maxlength = ENVIRON["maxlength"]; \
+    } \
+    { \
+        if (length($0) <= maxlength){ \
+            print; \
+        } \
+    }' |\
+    tr -d '"'`
+do
+    printf '%s' "$i" | tr -d '[:cntrl:]'
+    echo ""
+done >"${tmp1}"
+#
+echo "begin;" >"${tmp2}"
+# "uniq -c" puts a space (POSIX) or a tab (old GNU) after the count.
+for i in `cat "${tmp1}" |\
+    sort |\
+    uniq -c |\
+    sed -e "s/^ *//;s/[ ${tab}]/,/"`
+do
+    count=`printf '%s\n' "$i" | cut -d, -f1`
+    term=`printf '%s\n' "$i" | cut -s -d, -f2- | sed -e "s/'/''/g"`
+    if [ "X${term}" = "X" ]
+    then
+        : # do nothing
+    else
+        result=`printf '%s\n' "select count from ${table} where term='${term}';" |\
+            sqlite3 "${SFDB_PATH}"`
+        if [ "X${result}" = "X" ]
+        then
+            : # do nothing
+        else
+            if [ "${result}" -le "${count}" ]
+            then
+                printf '%s\n' "delete from ${table} where term='${term}';" >>"${tmp2}"
+            else
+                printf '%s\n' "update ${table} set count=count-${count} where term='${term}';" >>"${tmp2}"
+            fi
+        fi
+    fi
+done
+#
+echo "end;" >>"${tmp2}"
+cat "${tmp2}" |\
+sqlite3 "${SFDB_PATH}"
+# sum() is NULL once the table is empty; coalesce keeps the update valid.
+result=`echo "select coalesce(sum(count),0) from ${table};" |\
+    sqlite3 "${SFDB_PATH}"`
+echo "update t_total set count=${result} where tablenm='${table}';" |\
+sqlite3 "${SFDB_PATH}"
+#
+echo ${vacuum} |\
+sqlite3 "${SFDB_PATH}"
+#
+rm -f "${tmp1}" "${tmp2}"
+#
+exit 0
diff --git a/sf_init.sh b/sf_init.sh
new file mode 100755
index 0000000..3b7dac3
--- /dev/null
+++ b/sf_init.sh
@@ -0,0 +1,54 @@
+#! /bin/sh
+#
+# spam filter programs by m-ito@myh.no-ip.org
+#
+#----------------------------------------------------------------------
+#
+# functions
+# show_help: print the usage text on stdout (POSIX form, no "function").
+show_help () {
+    echo "Usage : $0"
+    echo "Initialize database."
+}
+#----------------------------------------------------------------------
+#
+# main routine: delete any existing database and create empty tables.
+#
+if [ "X$1" = "X-h" ] || [ "X$1" = "X--help" ]
+then
+    show_help
+    exit 0
+fi
+# Database location: ${SFDIR}/${SFDB}, defaulting to ${HOME}/.sf/sf.db.
+if [ "X${SFDIR}" = "X" ]
+then
+    SFDIR=${HOME}/.sf
+fi
+#
+if [ "X${SFDB}" = "X" ]
+then
+    SFDB=sf.db
+fi
+#
+SFDB_PATH=${SFDIR}/${SFDB}
+mkdir -p "${SFDIR}" || exit 1
+rm -f "${SFDB_PATH}"
+# SQL string literals use single quotes; "..." is an identifier in SQL.
+sqlite3 "${SFDB_PATH}" <<__EOF__
+create table t_white (
+    term text primary key,
+    count long long int
+);
+create table t_black (
+    term text primary key,
+    count long long int
+);
+create table t_total (
+    tablenm text primary key,
+    count long long int
+);
+insert into t_total values('t_white',0);
+insert into t_total values('t_black',0);
+__EOF__
+# Report sqlite3's status so a failed initialization is visible to callers.
+exit $?