From b5496b33fb99a5d82335f2ba683295ce374258b9 Mon Sep 17 00:00:00 2001 From: flaurent Date: Wed, 5 Feb 2020 16:58:22 +0530 Subject: [PATCH] Extract Hangouts user ids before parsing messages --- parsers/hangouts.py | 37 +++++++++++++++++++++++++------------ utils.py | 5 +++++ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/parsers/hangouts.py b/parsers/hangouts.py index 29b40ac..11112e3 100644 --- a/parsers/hangouts.py +++ b/parsers/hangouts.py @@ -93,22 +93,36 @@ def id_to_name(_id): def save_name_for_id(name, _id): if _id not in id_to_name_map: + log.debug(f'Found name {name}') id_to_name_map[_id] = name + elif id_to_name_map[_id] != name: log.info(f'Assuming {name} is {id_to_name_map[_id]}') - data = [] - log.info('Extracting messages...') - for conversation in archive['conversations']: - conversation_with_id = '' + # prefer the name that is not an email address + if '@' in name: + log.debug(f'Keeping {id_to_name_map[_id]}') + else: + log.debug(f'Keeping {name}') + id_to_name_map[_id] = name - # saves the fallback_name of all participants + # Extract ids before parsing messages, as sometimes the fallback_name is only found after the person's first message + log.info('Extracting all interlocutor ids...') + for conversation in archive['conversations']: if 'conversation' in conversation['conversation']: for participant in conversation['conversation']['conversation']['participant_data']: if 'fallback_name' in participant: - save_name_for_id(participant['fallback_name'], participant['id']['chat_id']) + full_name = participant['fallback_name'] + chat_id = participant['id']['chat_id'] + save_name_for_id(full_name, chat_id) + + data = [] + log.info('Parsing messages...') + for conversation in archive['conversations']: + conversation_with_id = '' for event in conversation['events']: + # there are many types of events, we are only interested in the chat messages with actual text content if 'chat_message' in event and 'segment' in event['chat_message']['message_content']: timestamp = int(event['timestamp']) content = event['chat_message']['message_content'] @@ -127,7 +141,6 @@ def save_name_for_id(name, _id): conversation_with_name = id_to_name(conversation_with_id) if sender_name is not None or conversation_with_name is not None: - # checks that the sender is either own_name or the interlocutor if sender_name != own_name and sender_id != conversation_with_id: log.error(f'Parsing error. Is your own_name {own_name} correct?') @@ -138,7 +151,7 @@ def save_name_for_id(name, _id): # saves the message timestamp = timestamp / 1000000 - outgoing = sender_name == own_name + outgoing = (sender_name == own_name) conversation_with_name = conversation_with_name if conversation_with_name is not None else '' sender_name = sender_name if sender_name is not None else '' data += [[timestamp, conversation_id, conversation_with_name, sender_name, outgoing, text, '', '']] @@ -149,6 +162,7 @@ def save_name_for_id(name, _id): if len(data) >= MAX_EXPORTED_MESSAGES: log.warning(f'Reached max exported messages limit of {MAX_EXPORTED_MESSAGES}. Increase limit in order to parse all messages.') return data + return data @@ -160,13 +174,11 @@ def read_archive(file_path): def infer_own_name(archive, min_conversations=2): - '''Infers own name from multiple conversations by finding the person who participated most in the conversations''' + """Infers own name from multiple conversations by finding the person who participated most in the conversations""" participants_conversation_count = defaultdict(int) num_conversations = 0 log.info('Trying to infer own_name from data...') for conversation in archive['conversations']: - conversation_with_id = '' - conversationWithName = '' if 'conversation' in conversation['conversation']: participants = conversation['conversation']['conversation']['participant_data'] participants = [p['fallback_name'] for p in participants if 'fallback_name' in p] @@ -174,8 +186,9 @@ def infer_own_name(archive, min_conversations=2): num_conversations += 1 for p in participants: participants_conversation_count[p] += 1 + if num_conversations >= min_conversations and len(participants_conversation_count.keys()) >= 2: own_name = max(participants_conversation_count, key=participants_conversation_count.get) log.info(f'Successfully inferred own-name to be {own_name}') return own_name - raise Exception('Could not infer own name from existing converstations. Please provide your username manually with the --own-name argument') + raise Exception('Could not infer own name from existing conversations. Please provide your username manually with the --own-name argument') diff --git a/utils.py b/utils.py index 50659a6..a09c888 100644 --- a/utils.py +++ b/utils.py @@ -45,6 +45,11 @@ def load_data(args): log.info(f'Reading data for platform {platform}') _df = pd.read_pickle(data_path) df.append(_df) + + if len(df) == 0: + log.error('No data to load!') + exit(0) + df = pd.concat(df, axis=0, ignore_index=True) original_len = len(df) # filtering