From ee09c472a0d2921e754bcd322bb69921e6e8308b Mon Sep 17 00:00:00 2001 From: mandu Date: Wed, 10 Jul 2024 19:39:23 +0900 Subject: [PATCH 1/7] =?UTF-8?q?modify=20article=5Fdata.csv=20:=20=EB=82=A0?= =?UTF-8?q?=EC=A7=9C=20=EB=8D=B0=EC=9D=B4=ED=84=B0=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data/article_data.csv | 204 +++++++++++++++++++++--------------------- 1 file changed, 102 insertions(+), 102 deletions(-) diff --git a/data/article_data.csv b/data/article_data.csv index 54bff6a..787a45b 100644 --- a/data/article_data.csv +++ b/data/article_data.csv @@ -1,102 +1,102 @@ -article_id,Economy and Business,Politics and Society,Technology and Culture,Sports and Leisure,Opinion and Analysis -1,0.0,1.0,0.0,0.0,0.0 -2,0.0,1.0,0.0,0.0,0.0 -3,0.0,1.0,0.0,0.0,0.0 -4,0.0,0.0,1.0,0.0,0.0 -5,0.0,0.0,1.0,0.0,0.0 -6,0.0,1.0,0.0,0.0,0.0 -7,0.0,0.0,0.0,1.0,0.0 -8,1.0,0.0,0.0,0.0,0.0 -9,1.0,0.0,0.0,0.0,0.0 -10,0.0,0.0,0.0,1.0,0.0 -11,0.0,1.0,0.0,0.0,0.0 -12,0.0,0.0,1.0,0.0,0.0 -13,1.0,0.0,0.0,0.0,0.0 -14,0.0,0.0,0.0,0.0,1.0 -15,0.0,0.0,0.0,0.0,1.0 -16,0.0,0.0,0.0,1.0,0.0 -17,0.0,1.0,0.0,0.0,0.0 -18,1.0,0.0,0.0,0.0,0.0 -19,0.0,1.0,0.0,0.0,0.0 -20,1.0,0.0,0.0,0.0,0.0 -21,0.0,0.0,0.0,1.0,0.0 -22,0.0,0.0,0.0,1.0,0.0 -23,0.0,0.0,0.0,0.0,1.0 -24,0.0,1.0,0.0,0.0,0.0 -25,0.0,0.0,0.0,0.0,1.0 -26,0.0,1.0,0.0,0.0,0.0 -27,0.0,0.0,0.0,0.0,1.0 -28,0.0,0.0,1.0,0.0,0.0 -29,0.0,0.0,1.0,0.0,0.0 -30,0.0,0.0,1.0,0.0,0.0 -31,0.0,0.0,1.0,0.0,0.0 -32,1.0,0.0,0.0,0.0,0.0 -33,0.0,0.0,1.0,0.0,0.0 -34,0.0,0.0,0.0,1.0,0.0 -35,0.0,0.0,0.0,1.0,0.0 -36,1.0,0.0,0.0,0.0,0.0 -37,0.0,0.0,1.0,0.0,0.0 -38,0.0,0.0,1.0,0.0,0.0 -39,0.0,0.0,1.0,0.0,0.0 -40,0.0,0.0,0.0,0.0,1.0 -41,0.0,1.0,0.0,0.0,0.0 -42,0.0,0.0,0.0,0.0,1.0 -43,0.0,1.0,0.0,0.0,0.0 -44,0.0,0.0,1.0,0.0,0.0 -45,0.0,0.0,1.0,0.0,0.0 -46,0.0,0.0,0.0,0.0,1.0 -47,0.0,0.0,0.0,0.0,1.0 -48,0.0,1.0,0.0,0.0,0.0 -49,0.0,0.0,0.0,1.0,0.0 -50,0.0,1.0,0.0,0.0,0.0 -51,0.0,0.0,0.0,0.0,1.0 -52,0.0,0.0,0.0,0.0,1.0 -53,1.0,0.0,0.0,0.0,0.0 -54,0.0,0.0,0.0,0.0,1.0 -55,1.0,0.0,0.0,0.0,0.0 -56,0.0,0.0,0.0,1.0,0.0 -57,0.0,1.0,0.0,0.0,0.0 -58,0.0,1.0,0.0,0.0,0.0 -59,1.0,0.0,0.0,0.0,0.0 -60,0.0,1.0,0.0,0.0,0.0 -61,0.0,0.0,0.0,0.0,1.0 -62,0.0,0.0,1.0,0.0,0.0 -63,1.0,0.0,0.0,0.0,0.0 -64,0.0,1.0,0.0,0.0,0.0 -65,1.0,0.0,0.0,0.0,0.0 -66,1.0,0.0,0.0,0.0,0.0 -67,0.0,0.0,1.0,0.0,0.0 -68,0.0,0.0,0.0,0.0,1.0 -69,1.0,0.0,0.0,0.0,0.0 -70,0.0,1.0,0.0,0.0,0.0 -71,0.0,0.0,0.0,1.0,0.0 -72,1.0,0.0,0.0,0.0,0.0 -73,1.0,0.0,0.0,0.0,0.0 -74,0.0,0.0,1.0,0.0,0.0 -75,0.0,0.0,0.0,0.0,1.0 -76,0.0,0.0,0.0,1.0,0.0 -77,0.0,1.0,0.0,0.0,0.0 -78,0.0,0.0,0.0,1.0,0.0 -79,0.0,1.0,0.0,0.0,0.0 -80,0.0,0.0,0.0,0.0,1.0 -81,0.0,1.0,0.0,0.0,0.0 -82,0.0,0.0,1.0,0.0,0.0 -83,0.0,0.0,1.0,0.0,0.0 -84,0.0,0.0,1.0,0.0,0.0 -85,0.0,0.0,1.0,0.0,0.0 -86,0.0,0.0,0.0,1.0,0.0 -87,0.0,0.0,0.0,0.0,1.0 -88,0.0,1.0,0.0,0.0,0.0 -89,0.0,1.0,0.0,0.0,0.0 -90,0.0,0.0,1.0,0.0,0.0 -91,0.0,0.0,1.0,0.0,0.0 -92,1.0,0.0,0.0,0.0,0.0 -93,0.0,0.0,0.0,0.0,1.0 -94,0.0,0.0,0.0,1.0,0.0 -95,0.0,1.0,0.0,0.0,0.0 -96,1.0,0.0,0.0,0.0,0.0 -97,1.0,0.0,0.0,0.0,0.0 -98,0.0,1.0,0.0,0.0,0.0 -99,0.0,0.0,0.0,1.0,0.0 -100,1.0,0.0,0.0,0.0,0.0 -101,0.0,1.0,0.0,0.0,0.0 +article_id,Economy and Business,Politics and Society,Technology and Culture,Sports and Leisure,Opinion and Analysis,created at +1,0.0,1.0,0.0,0.0,0.0,2023-01-01 +2,0.0,1.0,0.0,0.0,0.0,2023-01-02 +3,0.0,1.0,0.0,0.0,0.0,2023-01-03 +4,0.0,0.0,1.0,0.0,0.0,2023-01-04 +5,0.0,0.0,1.0,0.0,0.0,2023-01-05 +6,0.0,1.0,0.0,0.0,0.0,2023-01-06 +7,0.0,0.0,0.0,1.0,0.0,2023-01-07 +8,1.0,0.0,0.0,0.0,0.0,2023-01-08 +9,1.0,0.0,0.0,0.0,0.0,2023-01-09 +10,0.0,0.0,0.0,1.0,0.0,2023-01-10 +11,0.0,1.0,0.0,0.0,0.0,2023-01-11 +12,0.0,0.0,1.0,0.0,0.0,2023-01-12 +13,1.0,0.0,0.0,0.0,0.0,2023-01-13 +14,0.0,0.0,0.0,0.0,1.0,2023-01-14 +15,0.0,0.0,0.0,0.0,1.0,2023-01-15 +16,0.0,0.0,0.0,1.0,0.0,2023-01-16 +17,0.0,1.0,0.0,0.0,0.0,2023-01-17 +18,1.0,0.0,0.0,0.0,0.0,2023-01-18 +19,0.0,1.0,0.0,0.0,0.0,2023-01-19 +20,1.0,0.0,0.0,0.0,0.0,2023-01-20 +21,0.0,0.0,0.0,1.0,0.0,2023-01-21 +22,0.0,0.0,0.0,1.0,0.0,2023-01-22 +23,0.0,0.0,0.0,0.0,1.0,2023-01-23 +24,0.0,1.0,0.0,0.0,0.0,2023-01-24 +25,0.0,0.0,0.0,0.0,1.0,2023-01-25 +26,0.0,1.0,0.0,0.0,0.0,2023-01-26 +27,0.0,0.0,0.0,0.0,1.0,2023-01-27 +28,0.0,0.0,1.0,0.0,0.0,2023-01-28 +29,0.0,0.0,1.0,0.0,0.0,2023-01-29 +30,0.0,0.0,1.0,0.0,0.0,2023-01-30 +31,0.0,0.0,1.0,0.0,0.0,2023-01-31 +32,1.0,0.0,0.0,0.0,0.0,2023-02-01 +33,0.0,0.0,1.0,0.0,0.0,2023-02-02 +34,0.0,0.0,0.0,1.0,0.0,2023-02-03 +35,0.0,0.0,0.0,1.0,0.0,2023-02-04 +36,1.0,0.0,0.0,0.0,0.0,2023-02-05 +37,0.0,0.0,1.0,0.0,0.0,2023-02-06 +38,0.0,0.0,1.0,0.0,0.0,2023-02-07 +39,0.0,0.0,1.0,0.0,0.0,2023-02-08 +40,0.0,0.0,0.0,0.0,1.0,2023-02-09 +41,0.0,1.0,0.0,0.0,0.0,2023-02-10 +42,0.0,0.0,0.0,0.0,1.0,2023-02-11 +43,0.0,1.0,0.0,0.0,0.0,2023-02-12 +44,0.0,0.0,1.0,0.0,0.0,2023-02-13 +45,0.0,0.0,1.0,0.0,0.0,2023-02-14 +46,0.0,0.0,0.0,0.0,1.0,2023-02-15 +47,0.0,0.0,0.0,0.0,1.0,2023-02-16 +48,0.0,1.0,0.0,0.0,0.0,2023-02-17 +49,0.0,0.0,0.0,1.0,0.0,2023-02-18 +50,0.0,1.0,0.0,0.0,0.0,2023-02-19 +51,0.0,0.0,0.0,0.0,1.0,2023-02-20 +52,0.0,0.0,0.0,0.0,1.0,2023-02-21 +53,1.0,0.0,0.0,0.0,0.0,2023-02-22 +54,0.0,0.0,0.0,0.0,1.0,2023-02-23 +55,1.0,0.0,0.0,0.0,0.0,2023-02-24 +56,0.0,0.0,0.0,1.0,0.0,2023-02-25 +57,0.0,1.0,0.0,0.0,0.0,2023-02-26 +58,0.0,1.0,0.0,0.0,0.0,2023-02-27 +59,1.0,0.0,0.0,0.0,0.0,2023-02-28 +60,0.0,1.0,0.0,0.0,0.0,2023-03-01 +61,0.0,0.0,0.0,0.0,1.0,2023-03-02 +62,0.0,0.0,1.0,0.0,0.0,2023-03-03 +63,1.0,0.0,0.0,0.0,0.0,2023-03-04 +64,0.0,1.0,0.0,0.0,0.0,2023-03-05 +65,1.0,0.0,0.0,0.0,0.0,2023-03-06 +66,1.0,0.0,0.0,0.0,0.0,2023-03-07 +67,0.0,0.0,1.0,0.0,0.0,2023-03-08 +68,0.0,0.0,0.0,0.0,1.0,2023-03-09 +69,1.0,0.0,0.0,0.0,0.0,2023-03-10 +70,0.0,1.0,0.0,0.0,0.0,2023-03-11 +71,0.0,0.0,0.0,1.0,0.0,2023-03-12 +72,1.0,0.0,0.0,0.0,0.0,2023-03-13 +73,1.0,0.0,0.0,0.0,0.0,2023-03-14 +74,0.0,0.0,1.0,0.0,0.0,2023-03-15 +75,0.0,0.0,0.0,0.0,1.0,2023-03-16 +76,0.0,0.0,0.0,1.0,0.0,2023-03-17 +77,0.0,1.0,0.0,0.0,0.0,2023-03-18 +78,0.0,0.0,0.0,1.0,0.0,2023-03-19 +79,0.0,1.0,0.0,0.0,0.0,2023-03-20 +80,0.0,0.0,0.0,0.0,1.0,2023-03-21 +81,0.0,1.0,0.0,0.0,0.0,2023-03-22 +82,0.0,0.0,1.0,0.0,0.0,2023-03-23 +83,0.0,0.0,1.0,0.0,0.0,2023-03-24 +84,0.0,0.0,1.0,0.0,0.0,2023-03-25 +85,0.0,0.0,1.0,0.0,0.0,2023-03-26 +86,0.0,0.0,0.0,1.0,0.0,2023-03-27 +87,0.0,0.0,0.0,0.0,1.0,2023-03-28 +88,0.0,1.0,0.0,0.0,0.0,2023-03-29 +89,0.0,1.0,0.0,0.0,0.0,2023-03-30 +90,0.0,0.0,1.0,0.0,0.0,2023-03-31 +91,0.0,0.0,1.0,0.0,0.0,2023-04-01 +92,1.0,0.0,0.0,0.0,0.0,2023-04-02 +93,0.0,0.0,0.0,0.0,1.0,2023-04-03 +94,0.0,0.0,0.0,1.0,0.0,2023-04-04 +95,0.0,1.0,0.0,0.0,0.0,2023-04-05 +96,1.0,0.0,0.0,0.0,0.0,2023-04-06 +97,1.0,0.0,0.0,0.0,0.0,2023-04-07 +98,0.0,1.0,0.0,0.0,0.0,2023-04-08 +99,0.0,0.0,0.0,1.0,0.0,2023-04-09 +100,1.0,0.0,0.0,0.0,0.0,2023-04-10 +101,0.0,1.0,0.0,0.0,0.0,2023-04-11 From d46403f8425614652a9b25d034ad4b4aa87f65e5 Mon Sep 17 00:00:00 2001 From: mandu Date: Wed, 10 Jul 2024 19:39:50 +0900 Subject: [PATCH 2/7] =?UTF-8?q?modify=20interaction=5Fdata.csv=20:=20?= =?UTF-8?q?=ED=95=84=EC=9A=94=EC=97=86=EB=8A=94=20=EC=97=B4=20=EC=A0=9C?= =?UTF-8?q?=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data/interaction_data.csv | 131 +++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 66 deletions(-) diff --git a/data/interaction_data.csv b/data/interaction_data.csv index f10db4f..87d6b40 100644 --- a/data/interaction_data.csv +++ b/data/interaction_data.csv @@ -1,66 +1,65 @@ -classification_id,article_id,duration_time,user_id -13.0,41,98, -20.0,74,65, -3.0,1,1, -18.0,50,63, -1.0,42,64, -15.0,61,33, -19.0,47,83, -4.0,27,17, -12.0,58,59, -6.0,38,92, -16.0,60,75, -14.0,72,77, -17.0,75,50, -11.0,62,20, -7.0,36,79, -5.0,20,27, -9.0,19,45, -8.0,19,30, -10.0,44,23, -2.0,8,14, -2.0,58,53, -17.0,87,54, -14.0,17,6, -1.0,12,12, -12.0,15,92, -11.0,19,30, -12.0,62,6, -20.0,100,38, -10.0,32,37, -19.0,57,3, -5.0,76,94, -18.0,53,39, -4.0,91,18, -14.0,16,5, -12.0,45,74, -13.0,77,21, -2.0,56,69, -9.0,36,66, -7.0,28,38, -15.0,24,57, -15.0,39,98, -10.0,54,33, -8.0,85,95, -20.0,36,10, -7.0,8,13, -14.0,67,26, -16.0,16,34, -10.0,70,10, -6.0,80,85, -5.0,14,34, -6.0,50,63, -1.0,96,54, -8.0,38,92, -3.0,83,29, -11.0,91,72, -7.0,92,64, -2.0,36,73, -17.0,63,92, -9.0,75,73, -16.0,39,85, -1.0,61,74, -20.0,76,90, -6.0,90,85, -12.0,19,30, -,101,5,101.0 +classification_id,article_id,duration_time +13.0,41,98 +20.0,74,65 +3.0,1,1 +18.0,50,63 +1.0,42,64 +15.0,61,33 +19.0,47,83 +4.0,27,17 +12.0,58,59 +6.0,38,92 +16.0,60,75 +14.0,72,77 +17.0,75,50 +11.0,62,20 +7.0,36,79 +5.0,20,27 +9.0,19,45 +8.0,19,30 +10.0,44,23 +2.0,8,14 +2.0,58,53 +17.0,87,54 +14.0,17,6 +1.0,12,12 +12.0,15,92 +11.0,19,30 +12.0,62,6 +20.0,100,38 +10.0,32,37 +19.0,57,3 +5.0,76,94 +18.0,53,39 +4.0,91,18 +14.0,16,5 +12.0,45,74 +13.0,77,21 +2.0,56,69 +9.0,36,66 +7.0,28,38 +15.0,24,57 +15.0,39,98 +10.0,54,33 +8.0,85,95 +20.0,36,10 +7.0,8,13 +14.0,67,26 +16.0,16,34 +10.0,70,10 +6.0,80,85 +5.0,14,34 +6.0,50,63 +1.0,96,54 +8.0,38,92 +3.0,83,29 +11.0,91,72 +7.0,92,64 +2.0,36,73 +17.0,63,92 +9.0,75,73 +16.0,39,85 +1.0,61,74 +20.0,76,90 +6.0,90,85 +12.0,19,30 \ No newline at end of file From 306ca692fc5122255bc641ac8a4171c8c3210d80 Mon Sep 17 00:00:00 2001 From: mandu Date: Wed, 10 Jul 2024 19:40:46 +0900 Subject: [PATCH 3/7] =?UTF-8?q?feat=20recommend=5Fservice.py=20:=20?= =?UTF-8?q?=EC=B5=9C=EC=8B=A0=20=EA=B8=B0=EC=82=AC=EC=97=90=20=EA=B0=80?= =?UTF-8?q?=EC=A4=91=EC=B9=98=EA=B0=80=20=EC=B6=94=EA=B0=80=EB=90=98?= =?UTF-8?q?=EB=8F=84=EB=A1=9D=20=EC=BD=94=EB=93=9C=20=EB=B3=80=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- recommend_service.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/recommend_service.py b/recommend_service.py index e1036f8..5cb2f5c 100644 --- a/recommend_service.py +++ b/recommend_service.py @@ -6,18 +6,21 @@ import numpy as np from lightfm.cross_validation import random_train_test_split import os -from scipy.sparse import csr_matrix +from scipy.sparse import csr_matrix, coo_matrix import pandas as pd +from datetime import datetime + class ArticleDataInfo: - def __init__(self, article_id, category): + def __init__(self, article_id, category, created_at): self.article_data = pd.DataFrame({ 'article_id' : article_id, 'Economy and Business' : [0], 'Politics and Society' : [0], 'Technology and Culture' : [0], 'Sports and Leisure' : [0], - 'Opinion and Analysis' : [0] + 'Opinion and Analysis' : [0], + 'created at' : [created_at] }) self.article_data.iloc[0][category] = 1 @@ -53,8 +56,8 @@ def make_dataset(self): self.user_feat = self.user_datas.drop(columns =['classification_id']).to_dict(orient='records') self.item_features = self.article_datas - self.item_features_col = self.item_features.drop(columns=['article_id']).columns.values - self.item_feat = self.item_features.drop(columns =['article_id']).to_dict(orient='records') + self.item_features_col = self.item_features.drop(columns=['article_id', 'created at']).columns.values + self.item_feat = self.item_features.drop(columns =['article_id', 'created at']).to_dict(orient='records') self.dataset = Dataset() self.dataset.fit(users=[x for x in self.user_datas['classification_id']], items=[x for x in self.article_datas['article_id']], item_features=self.item_features_col, user_features=self.user_features_col) @@ -62,8 +65,11 @@ def make_dataset(self): self.item_features = self.dataset.build_item_features((x,y) for x,y in zip(self.item_features['article_id'], self.item_feat)) self.user_features = self.dataset.build_user_features((x,y) for x,y in zip(self.user_datas['classification_id'], self.user_feat)) - (self.interactions, self.weights) = self.dataset.build_interactions((x, y) - for x,y in zip(self.interaction_datas['classification_id'], self.interaction_datas['article_id'])) + (self.interactions, self.weights) = self.dataset.build_interactions((x, y, z * self.get_time_weight(y)) + for x,y, z in zip( + self.interaction_datas['classification_id'], + self.interaction_datas['article_id'], + self.interaction_datas['duration_time'])) num_users, num_items = self.dataset.interactions_shape() print('Num users: {}, num_items {}.'.format(num_users, num_items)) @@ -78,9 +84,7 @@ def make_model(self, n_components:int = 30, loss:str = 'warp', epoch:int = 30, n def fit_model(self): self.make_dataset() self.make_model() - # self.train, self.test = random_train_test_split(self.interactions,test_percentage=0.2, random_state=779) - # self.train_w, self.test_w = random_train_test_split(self.weights, test_percentage=0.2, random_state=779) - self.model.fit(self.interactions, user_features= self.user_features, item_features= self.item_features, epochs=self.epoch,num_threads = self.num_thread, sample_weight = self.weights) + self.model.fit(self.interactions, user_features= self.user_features, item_features= self.item_features, epochs=self.epoch,num_threads = self.num_thread, sample_weight = self.weights) def get_top_n_articles(self, user_id:int, article_num:int): item_ids = np.arange(self.interactions.shape[1]) # 예측할 아이템 ID 배열 @@ -104,6 +108,12 @@ def similar_items(self, item_id, N=10): # scores_new_user = self.model.predict(user_ids = 0,item_ids = np.arange(self.interactions.shape[1]), user_features=new_user) # top_items_new_user = self.article_datas.iloc[np.argsort(-scores_new_user)] # return top_items_new_user[:N] + def get_time_weight(self, article_id): + today = datetime.now().date() + date_obj = datetime.strptime(self.article_datas[self.article_datas['article_id'] == article_id]['created at'].iloc[0], "%Y-%m-%d").date() + difference = today - date_obj + return max(1 - ((difference.days//30)/5), 0) + def add_interaction_data(self, interaction_data:InteractionDataInfo): df = pd.read_csv(self.interaction_data_path) From df02ba28053d3333df54f35e93edf3a052f8390b Mon Sep 17 00:00:00 2001 From: mandu Date: Wed, 10 Jul 2024 19:41:31 +0900 Subject: [PATCH 4/7] =?UTF-8?q?feat=20recommend=5Fservice.py=20:=20article?= =?UTF-8?q?=EC=9D=B4=20=EC=B6=94=EA=B0=80=EB=90=A0=20=EB=95=8C=20=EB=B6=80?= =?UTF-8?q?=EB=B6=84=EC=A0=81=EC=9D=B8=20model=20fit=20=ED=95=A8=EC=88=98?= =?UTF-8?q?=20=EA=B5=AC=ED=98=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- recommend_service.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/recommend_service.py b/recommend_service.py index 5cb2f5c..47725b6 100644 --- a/recommend_service.py +++ b/recommend_service.py @@ -114,6 +114,9 @@ def get_time_weight(self, article_id): difference = today - date_obj return max(1 - ((difference.days//30)/5), 0) + def fit_model_partialy(self): + self.make_dataset + self.model.fit_partial(self.interactions, item_features=self.item_features) def add_interaction_data(self, interaction_data:InteractionDataInfo): df = pd.read_csv(self.interaction_data_path) From dea2cbdc804511433855d7ac8a319213bf393ca7 Mon Sep 17 00:00:00 2001 From: mandu Date: Wed, 10 Jul 2024 19:41:54 +0900 Subject: [PATCH 5/7] =?UTF-8?q?refact=20recommend=5Fservice.py=20:=20?= =?UTF-8?q?=ED=95=84=EC=9A=94=EC=97=86=EB=8A=94=20=EC=BD=94=EB=93=9C=20?= =?UTF-8?q?=EC=A0=9C=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- recommend_service.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/recommend_service.py b/recommend_service.py index 47725b6..0a432f3 100644 --- a/recommend_service.py +++ b/recommend_service.py @@ -101,13 +101,6 @@ def similar_items(self, item_id, N=10): return self.article_datas.iloc[best] - # def items_for_new_user(self, new_user_data:UserDataInfo, N:int): - # new_user = new_user_data.get_user_data() - # print(new_user) - # new_user = csr_matrix(new_user) - # scores_new_user = self.model.predict(user_ids = 0,item_ids = np.arange(self.interactions.shape[1]), user_features=new_user) - # top_items_new_user = self.article_datas.iloc[np.argsort(-scores_new_user)] - # return top_items_new_user[:N] def get_time_weight(self, article_id): today = datetime.now().date() date_obj = datetime.strptime(self.article_datas[self.article_datas['article_id'] == article_id]['created at'].iloc[0], "%Y-%m-%d").date() From 78090d56b02e032b09501c402810e1e95b0b76f8 Mon Sep 17 00:00:00 2001 From: mandu Date: Mon, 5 Aug 2024 22:56:12 +0900 Subject: [PATCH 6/7] =?UTF-8?q?feat(recommend=5Fservice.py)=20:=20?= =?UTF-8?q?=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- recommend_service.py | 168 ++++++++++++++++++++++++------------------- 1 file changed, 96 insertions(+), 72 deletions(-) diff --git a/recommend_service.py b/recommend_service.py index 0a432f3..0013139 100644 --- a/recommend_service.py +++ b/recommend_service.py @@ -1,124 +1,148 @@ +import asyncio import warnings -warnings.filterwarnings('ignore') +import pandas as pd +from datetime import datetime +import aiofiles +from concurrent.futures import ThreadPoolExecutor from lightfm import LightFM from lightfm.data import Dataset -from lightfm.evaluation import precision_at_k, recall_at_k, auc_score import numpy as np -from lightfm.cross_validation import random_train_test_split -import os -from scipy.sparse import csr_matrix, coo_matrix -import pandas as pd +import io -from datetime import datetime +warnings.filterwarnings('ignore') class ArticleDataInfo: def __init__(self, article_id, category, created_at): self.article_data = pd.DataFrame({ - 'article_id' : article_id, - 'Economy and Business' : [0], - 'Politics and Society' : [0], - 'Technology and Culture' : [0], - 'Sports and Leisure' : [0], - 'Opinion and Analysis' : [0], - 'created at' : [created_at] - }) + 'article_id': [article_id], + 'Economy and Business': [0], + 'Politics and Society': [0], + 'Technology and Culture': [0], + 'Sports and Leisure': [0], + 'Opinion and Analysis': [0], + 'created at': [created_at] + }) self.article_data.iloc[0][category] = 1 class InteractionDataInfo: def __init__(self, user_id, article_id, duration_time): self.interaction_data = pd.DataFrame({ - 'user_id' : [user_id], - 'article_id' : [article_id], - 'duration_time' : [duration_time] + 'user_id': [user_id], + 'article_id': [article_id], + 'duration_time': [duration_time] }) class RecommendService: def __init__(self): - self.set_user_datas('data/user_classification.csv') - self.set_article_datas('data/article_data.csv') - self.set_interaction_datas('data/interaction_data.csv') - - def set_user_datas(self, user_data_path): + asyncio.run(self.init_data()) + + async def init_data(self): + await self.set_user_datas('data/user_classification.csv') + await self.set_article_datas('data/article_data.csv') + await self.set_interaction_datas('data/interaction_data.csv') + + async def set_user_datas(self, user_data_path): self.user_data_path = user_data_path - self.user_datas = pd.read_csv(user_data_path) - - def set_article_datas(self, article_data_path): + self.user_datas = await self.read_csv(user_data_path) + + async def set_article_datas(self, article_data_path): self.article_data_path = article_data_path - self.article_datas = pd.read_csv(article_data_path) - - def set_interaction_datas(self, interaction_data_path): + self.article_datas = await self.read_csv(article_data_path) + + async def set_interaction_datas(self, interaction_data_path): self.interaction_data_path = interaction_data_path - self.interaction_datas = pd.read_csv(interaction_data_path) - + self.interaction_datas = await self.read_csv(interaction_data_path) + + async def read_csv(self, path): + async with aiofiles.open(path, mode='r') as file: + data = await file.read() + return pd.read_csv(io.StringIO(data)) + def make_dataset(self): self.user_datas = pd.get_dummies(self.user_datas) - self.user_features_col = self.user_datas.drop(columns =['classification_id']).columns.values - self.user_feat = self.user_datas.drop(columns =['classification_id']).to_dict(orient='records') - + self.user_features_col = self.user_datas.drop(columns=['classification_id']).columns.values + self.user_feat = self.user_datas.drop(columns=['classification_id']).to_dict(orient='records') + self.item_features = self.article_datas self.item_features_col = self.item_features.drop(columns=['article_id', 'created at']).columns.values - self.item_feat = self.item_features.drop(columns =['article_id', 'created at']).to_dict(orient='records') - + self.item_feat = self.item_features.drop(columns=['article_id', 'created at']).to_dict(orient='records') + self.dataset = Dataset() - self.dataset.fit(users=[x for x in self.user_datas['classification_id']], items=[x for x in self.article_datas['article_id']], item_features=self.item_features_col, user_features=self.user_features_col) - - self.item_features = self.dataset.build_item_features((x,y) for x,y in zip(self.item_features['article_id'], self.item_feat)) - self.user_features = self.dataset.build_user_features((x,y) for x,y in zip(self.user_datas['classification_id'], self.user_feat)) - + self.dataset.fit(users=[x for x in self.user_datas['classification_id']], + items=[x for x in self.article_datas['article_id']], + item_features=self.item_features_col, + user_features=self.user_features_col) + + self.item_features = self.dataset.build_item_features((x, y) for x, y in zip(self.item_features['article_id'], self.item_feat)) + self.user_features = self.dataset.build_user_features((x, y) for x, y in zip(self.user_datas['classification_id'], self.user_feat)) + (self.interactions, self.weights) = self.dataset.build_interactions((x, y, z * self.get_time_weight(y)) - for x,y, z in zip( - self.interaction_datas['classification_id'], - self.interaction_datas['article_id'], - self.interaction_datas['duration_time'])) - + for x, y, z in zip( + self.interaction_datas['classification_id'], + self.interaction_datas['article_id'], + self.interaction_datas['duration_time'])) + num_users, num_items = self.dataset.interactions_shape() print('Num users: {}, num_items {}.'.format(num_users, num_items)) - - def make_model(self, n_components:int = 30, loss:str = 'warp', epoch:int = 30, num_thread:int = 4): + + def make_model(self, n_components: int = 30, loss: str = 'warp', epoch: int = 30, num_thread: int = 4): self.n_components = n_components self.loss = loss self.epoch = epoch self.num_thread = num_thread - self.model = LightFM(no_components= self.n_components, loss=self.loss, random_state = 1616) - - def fit_model(self): + self.model = LightFM(no_components=self.n_components, loss=self.loss, random_state=1616) + + async def fit_model(self): + loop = asyncio.get_event_loop() + with ThreadPoolExecutor() as pool: + await loop.run_in_executor(pool, self.sync_fit_model) + + def sync_fit_model(self): self.make_dataset() self.make_model() - self.model.fit(self.interactions, user_features= self.user_features, item_features= self.item_features, epochs=self.epoch,num_threads = self.num_thread, sample_weight = self.weights) - - def get_top_n_articles(self, user_id:int, article_num:int): + self.model.fit(self.interactions, user_features=self.user_features, item_features=self.item_features, epochs=self.epoch, num_threads=self.num_thread, sample_weight=self.weights) + + def get_top_n_articles(self, user_id: int, article_num: int): item_ids = np.arange(self.interactions.shape[1]) # 예측할 아이템 ID 배열 predictions = self.model.predict(user_id, item_ids) top_items = self.article_datas.iloc[np.argsort(-predictions)[:article_num]] return top_items - + def similar_items(self, item_id, N=10): - item_bias ,item_representations = self.model.get_item_representations(features=self.item_features) + item_bias, item_representations = self.model.get_item_representations(features=self.item_features) scores = item_representations.dot(item_representations[item_id, :]) best = np.argpartition(scores, -N)[-N:] - + return self.article_datas.iloc[best] - + def get_time_weight(self, article_id): today = datetime.now().date() date_obj = datetime.strptime(self.article_datas[self.article_datas['article_id'] == article_id]['created at'].iloc[0], "%Y-%m-%d").date() difference = today - date_obj - return max(1 - ((difference.days//30)/5), 0) - - def fit_model_partialy(self): - self.make_dataset - self.model.fit_partial(self.interactions, item_features=self.item_features) - - def add_interaction_data(self, interaction_data:InteractionDataInfo): - df = pd.read_csv(self.interaction_data_path) + return max(1 - ((difference.days // 30) / 5), 0) + + async def add_interaction_data(self, interaction_data: InteractionDataInfo): + df = await self.read_csv(self.interaction_data_path) df = pd.concat([df, interaction_data.interaction_data]) - df.to_csv(self.interaction_data_path, index=False) - print("interactin is added") - - def add_article_data(self, article_data:ArticleDataInfo): - df = pd.read_csv(self.article_data_path) + await self.write_csv(df, self.interaction_data_path) + print("interaction is added") + + async def add_article_data(self, article_data: ArticleDataInfo): + df = await self.read_csv(self.article_data_path) df = pd.concat([df, article_data.article_data]) - df.to_csv(self.article_data_path, index=False) + await self.write_csv(df, self.article_data_path) print("article is added") + + async def write_csv(self, df, path): + async with aiofiles.open(path, mode='w') as file: + await file.write(df.to_csv(index=False)) + +# Example usage: +# recommend_service = RecommendService() +# asyncio.run(recommend_service.fit_model()) +# print(recommend_service.get_top_n_articles(1, 5)) +# print(recommend_service.similar_items(1)) +# asyncio.run(recommend_service.add_article_data(ArticleDataInfo(101, 'Politics and Society', '2024-07-01'))) +# asyncio.run(recommend_service.add_interaction_data(InteractionDataInfo(101, 101, 5))) From 2dbeef4a83d6ad9d3bb702690c7679d2c47ad7f8 Mon Sep 17 00:00:00 2001 From: mandu Date: Mon, 5 Aug 2024 22:56:32 +0900 Subject: [PATCH 7/7] =?UTF-8?q?feat(main.py)=20:=20=ED=85=8C=EC=8A=A4?= =?UTF-8?q?=ED=8A=B8=EC=9A=A9=20=EB=A1=9C=EC=A7=81=20=EA=B5=AC=ED=98=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 main.py diff --git a/main.py b/main.py new file mode 100644 index 0000000..9499575 --- /dev/null +++ b/main.py @@ -0,0 +1,31 @@ +from user_classification import user_data_to_classification_id +from recommend_service import RecommendService +import pandas as pd +import asyncio + +async def main(): + user_data_path = 'data/user_data_classified.csv' + user_datas = pd.read_csv(user_data_path) + recommendService = RecommendService() + await recommendService.fit_model() + user_id = 1 + print(user_data_to_classification_id( + user_datas.iloc[user_id]['sex'], + user_datas.iloc[user_id]['issue finder'], + user_datas.iloc[user_id]['lifestyle consumer'], + user_datas.iloc[user_id]['entertainer'], + user_datas.iloc[user_id]['tech specialist'], + user_datas.iloc[user_id]['professionals'] + )) + print(recommendService.get_top_n_articles( + user_data_to_classification_id( + user_datas.iloc[user_id]['sex'], + user_datas.iloc[user_id]['issue finder'], + user_datas.iloc[user_id]['lifestyle consumer'], + user_datas.iloc[user_id]['entertainer'], + user_datas.iloc[user_id]['tech specialist'], + user_datas.iloc[user_id]['professionals'] + ), 5)) + +if __name__ == '__main__': + main() \ No newline at end of file