Skip to content

Commit

Permalink
SpamPrediction: use Page Segmentation Mode 6 for tesseract OCR
Browse files (browse the repository at this point in the history)
Change-Id: If8965154d05e234a69795dcc242a15dce59ab897
  • Loading branch information
adekmaulana committed May 31, 2022
1 parent 1bec2ac commit 70a2443
Showing 1 changed file with 14 additions and 6 deletions.
20 changes: 14 additions & 6 deletions anjani/plugins/spam_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ async def _predict(self, text: str) -> util.types.NDArray[float]:
async def _is_spam(self, text: str) -> bool:
    """Return whether the model labels *text* as spam.

    The single text is wrapped in a one-element list and handed to
    ``self.model.predict`` through ``util.run_sync``; the first entry of
    the awaited result is compared against the ``"spam"`` label.
    """
    predictions = await util.run_sync(self.model.predict, [text])
    first_label = predictions[0]
    return first_label == "spam"

async def runOcr(self, message: Message) -> Optional[str]:
"""Read image text"""
async def run_ocr(self, message: Message) -> Optional[str]:
"""Run tesseract"""
try:
image = AsyncPath(await message.download())
except Exception: # skipcq: PYL-W0703
Expand All @@ -154,7 +154,15 @@ async def runOcr(self, message: Message) -> Optional[str]:
)

try:
stdout, _, exitCode = await util.system.run_command("tesseract", str(image), "stdout")
stdout, _, exitCode = await util.system.run_command(
"tesseract",
str(image),
"stdout",
"-l",
"eng+ind",
"--psm",
"6"
)
except Exception as e: # skipcq: PYL-W0703
return self.log.error("Unexpected error occured when running OCR", exc_info=e)
finally:
Expand Down Expand Up @@ -332,7 +340,7 @@ async def on_message(self, message: Message) -> None:
else (message.caption.strip() if message.media and message.caption else None)
)
if message.photo:
future = self.bot.loop.create_task(self.runOcr(message))
future = self.bot.loop.create_task(self.run_ocr(message))
future.add_done_callback(
partial(self.bot.loop.call_soon_threadsafe, self._check_spam_results_ocr, message)
)
Expand Down Expand Up @@ -562,7 +570,7 @@ async def cmd_spam(self, ctx: command.Context) -> Optional[str]:
content = ctx.input

if reply_msg and reply_msg.photo:
ocr_result = await self.runOcr(reply_msg)
ocr_result = await self.run_ocr(reply_msg)
if ocr_result:
try:
await self.mark_spam_ocr(ocr_result, user_id, ctx.chat.id, reply_msg.id)
Expand Down Expand Up @@ -643,7 +651,7 @@ async def cmd_predict(self, ctx: command.Context) -> Optional[str]:
reply_to_message_id=replied.id,
)

ocr_result = await self.runOcr(replied)
ocr_result = await self.run_ocr(replied)
if ocr_result:
ocr_prediction = await self._predict(ocr_result)
if ocr_prediction.size != 0:
Expand Down

0 comments on commit 70a2443

Please sign in to comment.