-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdetectLang.js
35 lines (30 loc) · 10.2 KB
/
detectLang.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
// Hiragana syllables (including voiced and small forms) plus a few punctuation
// marks that are counted as Japanese characters by the detector.
const hiragana =
  "あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをんがぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽっゃゅょ、。”「」";

// Katakana syllables (including voiced and small forms) and the long-vowel mark.
// NOTE: this table must not contain ASCII "-": inside a regex character class a
// stray hyphen either forms a range or matches literally, which would make
// hyphens in English text count as Japanese characters.
const katakana =
  "アイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲンガギグゲゴザジズゼゾダヂヅデドバビブベボパピプペポッャュョィェォー";

// A subset of kanji (the original note says 3034 characters); it is not an
// exhaustive list. Split into two concatenated literals so that no string
// literal contains a line break.
const kanjis =
  "日一十二人大年会国三本長中五出事社市者月四九同自政時業分上前生合行部地後議党八民六見間新員入場円学東発方高内百金七定子的対手立田回選今連県代開約力関体明山動万通目千全京実問決相米当度下主理表化調所小不取用治現法気経公話最野川家務制外区作性期設戦名来要意機言産成氏数権教題道保受勝総原思以強平協組島都多持支和加海続正安進売書水知工近指電心改物点北元文午記初報車府資首町世女挙院委査界団校予第画計判先西重向結岡込共品信局集活建引交反告利億済次件村考投減面派任解運際側広税省使増認付官求半別阪企朝策直男統職切福変店案屋得住木台始価無打昨在感費口聞情歳料藤論役示各営空参革州基流領死井容疑検古私特係格勢宮神式止置果放割裁少土演有線石語過終必転確両送状害提位研護収再消軍能比銀商輸崎警規由農常応育違夫士義身配館敗術説足施食補起乗球断残額導算南港証真究沢病談争急楽松落援製何率宅例備可訴構番張声美着担労優域難防幹庁武造佐与視副試質限路態景象技助郎準待験境親席形展頭戸谷働況仕葉早株監授量門票想然渡審課良伝英管条念愛横族被供医室映末負個器低若字追差値種鉄風申味整音退守注返環型太天客影財含望紙観買姿科響花復春述好去様園橋答隊察移児程細独光捕命推洋衛専失評帰白兵訪辺請呼離深師史処段材販積歩振修競婦号雄階根船賞赤歌浜火他満録丸夜悪融弁候河討単達林効署極系非針識久憲吉庫森並益接登衆撃券捜故閣城秋青母周韓編責寄模鮮図催右央走江障易清速博欧友秒督健休除渉飛献均存航介給蔵超座採殺従未色険富幅鹿旧読遺札激摘突黒写豊具司苦伊核療余週波完織類左講津顔駅就夏舞廃装降批否賀破紀脳債厳薬便歴抜危延冷興越馬異父劇拡逮貿及素覚頼盛習般占留罪級竹短等換将尾芸毎締寺丁植羽亡迎績旅岩盟喜沖陸標払替奈皇許因伸熱攻永継王散玉油妻属禁志静印維幸厚辞香湾暴圧勤抗倍背昭途戻貨帯順押曲岸徒崩源彼則輪精逆囲聴善刑池踏服適犯遅塁倉壊角僚略欠郡雨層償婚宿徳固震熊坂塚測賛倒旬普版停築脱執軽練康街浦避棄列損患載爆居募陣像傷漁乱卒迫絡遣闘星緊諸惑彦鳥跡之浮庭遠老雑狙削臨焼底為昇養儀努混樹温宣縄創絶郵密令恵契繰救届齢願恐遊掲息致納幕枚賃血更葬裏栄拠染岐抱需兆房刊著巨逃雇貴招隆迷汚仲抑草板希暮措刻併承災季誌欲借宇邦秀堂複骨里折瀬描純絵酒触訳鈴筋弘射弱択伴撤奏君扱痛秘詰節陽了緩託端功探盤巻却訟阜布笑困縮夕徴称閉奥盗懸飲章還誘弾貸我控智裕預贈緒傾掛薦仏慮燃肉束泉依埋症宗砂枠互駐敬群曜慎紹奪購夢項診簡律祭拒促片杉躍充免刺祉銃勧看圏譲仙甲埼仮壁典操御悩宝銭荷顧稲俊沿臓衝誉豪排華孝兼雪唱雅鋼照隠剤謝撮渋携巡吹殿忠晴冬茂揮徹捨薄獲戒腐柱飯敏堀透勉駆隣犠妥敷滞械貯茶菱魚悲荒吸誤握籍己句袋誠包範揺筆暫析綱梅揚窓床潟侵芝茨哲翌柳騒暗索襲娘網塩挑郷喪到緑誕弟浩堅皮裂双畑紛貢舎衣賄童快訓肝毛礼炭濃寝洗驚也慣昼歯臣柄宏滑仁至忘微歓焦掘克抵泊似鉱液牛括嶋鳴趣阿酸胸乳較距斎偽杯斉妙既浅飾雲筑即湖硬威露閥覇矢潮鎖祝琴邸釈慶是旗桜封旭煕浴賠炎昔宙牧兄詳寿柔繁麻滋肩慰腕貫腹朗撲滅犬尊鶴縦泳垣須旨幼賢詩懇昌貞泰菜翼畿嫌毒励輝添甘幌沈蓄脅刷黄沼湯諭舗炉箱竜岳剣摩砲干敵珍栃懲錦稼罰縁虫喚踊鯨勇丘陳陥祖俳枝牲梶腰怒耳龍浪煙膨覧棟丹丈那肥簿軸魅梨穂繊帝穴稿啓尽倫往糸曽卵聖礎絞頑偏貧遇漏零序孤没亀狭耐恒豆浄駒瞬拓郊彩糧塾飼李菊脚冊鏡玄粉殖廷軒墓寒怖拘頂幡噴欄眠僕疲擁悟恩憶皆誇孫殊溶軟寛覆嘉卓拍煮亜慢芳凍俣彰曙胞彫匹菌径輩黙培恋泣尋磨潜剰鑑紅眼笠陰阻諮灯遂髪尚弥概雰剛銘掃缶耕霊陶脇伏熟喫渕卸尻磁妹帳栗艦浸湿漫冒訂刈壇淡姉祥涙尼筒庄忙虚披暖征唯謀叫咲奇妨晃猶屈悔釣晩肺媛暇嵐跳脈辻柴隔巣騰栽俵奨秩塗祈菅唆随抽衡据斜桑荘鬼乾佳粘拝隻猛漢棒砕糖架孜脂鋭盾緯苗姫堤紫銅狂祐勲洲氷篠翻淳癒麦紡軌翔嘆衰澤徐靖懐堺粒貝冨逸疎虐才陛刀汁惨葛粧准柏詐俗滝酬胆紋稚芦綿漂溝瑞雷奉嫁珠殴班伎詞烈堰鎌棚釧潤穫冠墨霧墳蒸拾峰邪陵艇鈍寮泥奮棋亭幻恥姓粛廊郭妃潔穏悦欺灰磯桂皿疫伯駄妊凶舟巧酔仰晶汗淀膜菓魔辰涯謙盆毅唐釜澄萩牟靴勘乏髄吾洞笹誓遭宜劣矛桐芽輔碑獄峡擦肌顕鎮偶把椿坊搬符敦傍垂鉛粗暑后宰塔稔壮刃縫鳩鐘弓呉机尿惜霞隅朴臭猫辛謡悼壌鼻畳累狩寂赴膚炊忍穀酷荻帽亮庶恭鷹怠弦匠傘拐寸憂瓶胃諾函頻枯履槽閲錯哉魂虜羅僧肢怪鍋盲灘践圭鉢呂獣宴蒲鳳桃麗乃呈吐裸窒囚紳循綾幣鼓該遷搭坪漠猪腎飢琵寧朱琶窮憾蘭鴨畠如舶偉濯暦騎俸樋楠娠旋殻涼腸畜鶏粋召虎藩漬洪縛昆薫眺赦弊苑蘇郁磐沸琉哀凡槻扇慈伐蓮巳湊椎濁燥媒奄墜愚苫嶺酵轄晋娯幾蛇鍛枢篤疾串朽紘憤扉柿泡碁掌飽伍亘辱曇又戯悠賓譜酢巴曹弔憎餓笛挟凝瓦慨憩豚窃挫淵聡嗣淑恨崇礁斐舌痴摂享岬扶樺匿偵錠渦紺猟猿陪甚塊窯剖鷲肇叙窪倶喝胴嘱尺鵬屯冗箕朋亨硫濠橘盧汽坑栖駿楼榎唄鄭醸廉酪雀款國遮傑茅乙喬睦睡廣播肖絹暁胎樫詠忌臼肯苅遍帆琢樽秦挿寅薩沙敢巌屏舘峯粕斗劉楊佑鋳禅腫蕪湘玲羊闇肪桧" +
  "趙梁嶽奔瘍塀痢隈諏擬吟漆蔭棺峠鴻閧眞愉汎蕉魁騨奴爪芭厘此抹欣閑禍漱暢侍芥蒼渓糾鵜滴胡茎鍵尉杵翁慌匡迅塞僑曳逝烏惧癖洸痕聯禎祇藻勅卑拳錬硝爾牡條逓庵姜慕隼謎綻渇藍橿崔潰桟榊嬢蛍隷愁腺獅遼珂鞍誰婿枕姑斡畔袖蚕孔妖栓揖呆繕拙襟庸粟萬其虹杜餅迭哨謹泌蛭渥祀幽蝶瀧稜蜂碧叔酌兜姻洛頓雫翫壱耗矯蟹燕升殉雌嚇凸嫡翠籠楢租斥鯉允涛賭宋濱簗醜侮峻褒煥禄暉壽薮孟賊唇玩瑠霜芹馨蚊魏眉宍彬蛮蹴慧斑巽搾股逢冥堪叡貌厄蒙麿拭寡甫堆櫛脊稽倭佃雁旺摯鉾箸逐實芙罷銚捉采萌逗讃鷺鐸凹葵檜璃賜薗瑛拷牽湧堕濤圓椋竪陀呪侯鮎嵯麹峨荏附鍾赳啄廠骸諌褐牙蜜杷榛鴎邨綜漕膳迦鎔麓畷嬉桶脩菩埴錫瀋壕玖壷詣於杏堯尹汰惣伺鯖曝杖譚皓煩埜宵朔舷徽葦黎狼柘爵汐矩欽礒疆侶捻衷邱焉妄鋪蕃檀惰董伽笘冤惇瞳戴噌卿魯箔匂芯抄儒斬頃蓋胤團鰐弄瞭邑冶婆痘綬鮫惟榮恣倣灸蹊賦枇淘挺帥叱蒔閤狐饗凱芋汪蔑槌旦餌諫舵狛杭祢堵帖畝碓梓壺篭某鋒菰艶賈浙拉麺桝篇韮綴銑狗韻佛繭廟酉彌已璧晟溜蓉輿塵丞昏梗襄藪婁但槍厨蒋紗縞窟夷聰洙捺宕兎酎湛爽薪絢絆嵜囃撫詔喧桁萱頸與旛捧舜而倹頴噂釉諧蜷娩沌悌諜坦只卯舩冴袴崖云齊勃薙醤丼歪渚坐訣怨嘘蕭毬巖碗鱗葺謄諦摺嘗戎鍼邊膠噺娼讐弧畦癌丑莉緻傳砺廻瓜鰻彗牌醍塑漸穣砦醐沓迪虔腱洽櫻姪鱒罵樟柵腔巷喉鞠俺鸞嘲賂呑填舛濶禧詫吏涌蔓汀碩墾箇雉衞禹檄楸乖蕨耀耶瓢燈溺鯛楚蝉稀箏禮拗傲唾捷鵠硯飴綺昴怜瀕剥妬梯裾嵩梢漉寓喰辜笙揆拮峙們螺牝昧丙挽莫疏夙嫉粥蝦畏袁蓼蔡滉鬱幇凰麟爺膿廿凧脆雛梱昂姶簑筌癬涵濫斌庇塙壬蛙厩或茜羞齋澁曼琳濡剃叢棲錘菖偲纂拶庚茄挨蛛莽炳溥榜惠彭咸偕傭糞釘蔦蜘燦絃仇咽琿殷杞揄揶罹圀嗜俑牢彪套捗佼袈姦虻鞆蛯薇茫筰瞑煌掟嶌櫓蓑吠憧砥撰趨裟垢侠砧臥陜鐵躊躇謳訃薔肛渾泄斂哈藁楓紐梼凋黛煎凄椙疹蟻鴈饒藝笏琲珈洒椒扁奎哺匯椀蝋憐緬祷轍醒楯咋犀乞汲劾鴬穎邁磋溪棹曖慄寇勁侑聾簾苓頬鞭疋毘尖鉦餐尭瞠疱烋崑毀凌蒜頒鎧臆溢亥椅槇闊鎬葭筍瞰澪暎憑凛柚蓬埠馴腿揃羨屍糠叶甥鄒胚總筝矮瞞甦珀游晰雙剱猷頁錨氾嚢汝吊叩曾戚斯坤砿鍬妓莞侃迂邉辣萠茗篆礫滓梟晧噪喩燮兪籾斧柊鐙貼紬痩鴫碕虞巾駕鯰靱蘆荼絲筧祠琥爛戌巍咤叟儲豹弼稗緋碇蛋苔楕遡詮裳藏耿糀筥瘤潴潘枦晨嘔伶沃鼎馳箪遜惹忽繋禽誼靡顯躬贅蜃糺秉猥熾杣孵亙諒瞥焚栂杓匙甑轟謁洩姥穐鬆隕閔鉉邇萍舫聚紆籔瓊榕恂彷彙巫儡傀煉燐淋悶蕗鏑寵耽蛸蹟妾竣勾糊鰍蛎苛裔蟇膀胱罠篁疥當璋獨猴愕哭勣于勿亦膝誹秤岱笥蝕燭膏倦頚桔碍襖瑤韶霍鐐賤謗諍臚腑聶聘經籃炯榴楷梵將學娜俯豫慾圃扮楳遁瀞禿涜嬬鞘鮭惚劫怯鋸竿盈";

// Backslash-escapes the characters that are special inside a regex character
// class (\ ] ^ -) so every table entry is matched literally.
const escapeForCharClass = (chars) => chars.replace(/[\\\]^-]/g, "\\$&");

// Source of a character class matching any single character from the tables.
const japaneseChar =
  "[" + escapeForCharClass(hiragana + katakana + kanjis) + "]";

// Global regex: String.prototype.match with it returns every Japanese
// character found in a string (or null when there is none).
const reJap = new RegExp(japaneseChar, "g");
module.exports = function (text, japPropThresh = 0.5, verbose = false) {
if (verbose) {
console.log("detect language for: " + text);
}
// remove urls
text = text.replace(/(?:https?|ftp):\/\/[\n\S]+/g, "");
var propJap = (text.match(reJap) || []).length / text.length;
if (propJap >= 0.05) {
if (verbose) {
console.log("Japanese detected");
}
return "JA";
} else {
if (verbose) {
console.log("Japanese NOT detected");
}
return "EN";
}
};