From 70a24431aa290a9fe3fb9c189335638fb3bd4e78 Mon Sep 17 00:00:00 2001 From: Adek Date: Wed, 1 Jun 2022 00:45:07 +0700 Subject: [PATCH] SpamPrediction: use Page Segmentation Mode 6 for tesseract OCR Change-Id: If8965154d05e234a69795dcc242a15dce59ab897 --- anjani/plugins/spam_prediction.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/anjani/plugins/spam_prediction.py b/anjani/plugins/spam_prediction.py index 4cf4d6bde..2bbc8ef6e 100644 --- a/anjani/plugins/spam_prediction.py +++ b/anjani/plugins/spam_prediction.py @@ -142,8 +142,8 @@ async def _predict(self, text: str) -> util.types.NDArray[float]: async def _is_spam(self, text: str) -> bool: return (await util.run_sync(self.model.predict, [text]))[0] == "spam" - async def runOcr(self, message: Message) -> Optional[str]: - """Read image text""" + async def run_ocr(self, message: Message) -> Optional[str]: + """Run tesseract""" try: image = AsyncPath(await message.download()) except Exception: # skipcq: PYL-W0703 @@ -154,7 +154,15 @@ async def runOcr(self, message: Message) -> Optional[str]: ) try: - stdout, _, exitCode = await util.system.run_command("tesseract", str(image), "stdout") + stdout, _, exitCode = await util.system.run_command( + "tesseract", + str(image), + "stdout", + "-l", + "eng+ind", + "--psm", + "6" + ) except Exception as e: # skipcq: PYL-W0703 return self.log.error("Unexpected error occured when running OCR", exc_info=e) finally: @@ -332,7 +340,7 @@ async def on_message(self, message: Message) -> None: else (message.caption.strip() if message.media and message.caption else None) ) if message.photo: - future = self.bot.loop.create_task(self.runOcr(message)) + future = self.bot.loop.create_task(self.run_ocr(message)) future.add_done_callback( partial(self.bot.loop.call_soon_threadsafe, self._check_spam_results_ocr, message) ) @@ -562,7 +570,7 @@ async def cmd_spam(self, ctx: command.Context) -> Optional[str]: content = ctx.input if reply_msg and reply_msg.photo: - ocr_result = await self.runOcr(reply_msg) + ocr_result = await self.run_ocr(reply_msg) if ocr_result: try: await self.mark_spam_ocr(ocr_result, user_id, ctx.chat.id, reply_msg.id) @@ -643,7 +651,7 @@ async def cmd_predict(self, ctx: command.Context) -> Optional[str]: reply_to_message_id=replied.id, ) - ocr_result = await self.runOcr(replied) + ocr_result = await self.run_ocr(replied) if ocr_result: ocr_prediction = await self._predict(ocr_result) if ocr_prediction.size != 0: