From d64002f441ed6e33b25f3e75a8be1b6e20f0d7c2 Mon Sep 17 00:00:00 2001 From: j <@> Date: Fri, 2 Sep 2022 14:41:35 -0500 Subject: [PATCH 01/17] Add cmd line args. Readability cleanup. No output change. --- HangoutJsonParser.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index 3110e01..47f92f7 100644 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -1,11 +1,5 @@ -import json - -# location of Hangouts Json file obtained from Google Takeout -with open('/path/to/JSON/data/file.json', 'r', encoding='utf-8') as f: - jsonData = json.load(f) - -simpleJson = [] - +#!/usr/bin/python +import argparse, json, os def parseData(): for i in range(0, len(jsonData['conversations'])): @@ -17,8 +11,7 @@ def parseData(): for j in range(0, len(jsonData['conversations'][i]['events'])): message = {} message['sender'] = {} - message['sender']['name'] = getName( - jsonData['conversations'][i]['events'][j]['sender_id']['gaia_id'], conversation['participants']) + message['sender']['name'] = getName(jsonData['conversations'][i]['events'][j]['sender_id']['gaia_id'], conversation['participants']) message['sender']['id'] = jsonData['conversations'][i]['events'][j]['sender_id']['gaia_id'] message['unixtime'] = (int(jsonData['conversations'][i] ['events'][j]['timestamp']))/1000000 @@ -57,9 +50,9 @@ def getParticipants(index): def getName(id, participants): - for i in range(0, len(participants)): - if id == participants[i]['id']: - return participants[i]['name'] + for p in participants: + if id == p['id']: + return p['name'] return id @@ -76,8 +69,13 @@ def chatName(i): name = participants[index] return name - if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('INPUT_JSON_PATH', help='Path location of Hangouts.json file obtained from Google Takeout.') + parser.add_argument('OUTPUT_DIRECTORY', help='Path to write output files.') + args = parser.parse_args() + + jsonData = json.load(open(args.INPUT_JSON_PATH, 'r', encoding='utf-8')) + simpleJson = [] parseData() - with open("clean_hangoutsData.json", "w", encoding="utf-8") as write_file: - json.dump(simpleJson, write_file, indent=4) + json.dump(simpleJson, open(os.path.join(args.OUTPUT_DIRECTORY, 'clean_hangoutsData.json'), 'w', encoding='utf-8'), indent=4) From f31ea4ef1a107d477afbdd0fad9192744fa17d34 Mon Sep 17 00:00:00 2001 From: j <@> Date: Fri, 2 Sep 2022 14:49:16 -0500 Subject: [PATCH 02/17] Readability cleanup. No output change. --- HangoutJsonParser.py | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index 47f92f7..0189997 100644 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -8,26 +8,23 @@ def parseData(): conversation['participants'] = getParticipants(i) conversation['messages'] = [] - for j in range(0, len(jsonData['conversations'][i]['events'])): + for event in jsonData['conversations'][i]['events']: message = {} message['sender'] = {} - message['sender']['name'] = getName(jsonData['conversations'][i]['events'][j]['sender_id']['gaia_id'], conversation['participants']) - message['sender']['id'] = jsonData['conversations'][i]['events'][j]['sender_id']['gaia_id'] - message['unixtime'] = (int(jsonData['conversations'][i] - ['events'][j]['timestamp']))/1000000 + message['sender']['name'] = getName(event['sender_id']['gaia_id'], conversation['participants']) + message['sender']['id'] = event['sender_id']['gaia_id'] + message['unixtime'] = (int(event['timestamp']))/1000000 - if 'chat_message' in jsonData['conversations'][i]['events'][j]: + if 'chat_message' in event: # if it's a message(normal hangouts, image...) - if 'segment' in jsonData['conversations'][i]['events'][j]['chat_message']['message_content']: + if 'segment' in event['chat_message']['message_content']: # if it's a normal hangouts message content = "" - for k in range(0, len(jsonData['conversations'][i]['events'][j]['chat_message']['message_content']['segment'])): - if jsonData['conversations'][i]['events'][j]['chat_message']['message_content']['segment'][k]['type'] == "TEXT": - content = content + \ - jsonData['conversations'][i]['events'][j]['chat_message']['message_content']['segment'][k]['text'] - elif jsonData['conversations'][i]['events'][j]['chat_message']['message_content']['segment'][k]['type'] == "LINK": - content = content + \ - jsonData['conversations'][i]['events'][j]['chat_message']['message_content']['segment'][k]['text'] + for k in range(0, len(event['chat_message']['message_content']['segment'])): + if event['chat_message']['message_content']['segment'][k]['type'] == "TEXT": + content = content + event['chat_message']['message_content']['segment'][k]['text'] + elif event['chat_message']['message_content']['segment'][k]['type'] == "LINK": + content = content + event['chat_message']['message_content']['segment'][k]['text'] message['content'] = content conversation['messages'].append(message) @@ -59,15 +56,9 @@ def getName(id, participants): def chatName(i): if (('name' in jsonData['conversations'][i]['conversation']['conversation'])and(jsonData['conversations'][i]['conversation']['conversation']['name'] != "")): return jsonData['conversations'][i]['conversation']['conversation']['name'] - participants = [] - index = 0 - for k in range(0, len(simpleJson[i]['participants'])): - participants.append(simpleJson[i]['participants'][k]['name']) - if simpleJson[i]['participants'][k]['id'] == jsonData['conversations'][i]['conversation']['conversation']['self_conversation_state']['self_read_state']['participant_id']['gaia_id']: - index = k - break - name = participants[index] - return name + for part in simpleJson[i]['participants']: + if part['id'] == jsonData['conversations'][i]['conversation']['conversation']['self_conversation_state']['self_read_state']['participant_id']['gaia_id']: + return part['name'] if __name__ == '__main__': parser = argparse.ArgumentParser() From 471765259bd82b274d0814e37b7921f2939c8f94 Mon Sep 17 00:00:00 2001 From: j <@> Date: Fri, 2 Sep 2022 15:01:29 -0500 Subject: [PATCH 03/17] Readability cleanup. No output change. --- HangoutJsonParser.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index 0189997..c99096e 100644 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -5,7 +5,7 @@ def parseData(): for i in range(0, len(jsonData['conversations'])): conversation = {} conversation['chatName'] = "" - conversation['participants'] = getParticipants(i) + conversation['participants'] = getParticipants(jsonData['conversations'][i]['conversation']['conversation']['participant_data']) conversation['messages'] = [] for event in jsonData['conversations'][i]['events']: @@ -33,32 +33,26 @@ def parseData(): simpleJson[i]['chatName'] = chatName(i) -def getParticipants(index): - participants = [] - for i in range(0, len(jsonData['conversations'][index]['conversation']['conversation']['participant_data'])): - person = {} - person['id'] = jsonData['conversations'][index]['conversation']['conversation']['participant_data'][i]['id']['gaia_id'] - if 'fallback_name' in jsonData['conversations'][index]['conversation']['conversation']['participant_data'][i]: - person['name'] = jsonData['conversations'][index]['conversation']['conversation']['participant_data'][i]['fallback_name'] - else: - person['name'] = jsonData['conversations'][index]['conversation']['conversation']['participant_data'][i]['id']['gaia_id'] - participants.append(person) - return participants +def getParticipants(participant_data): + return [{ + 'id': participant['id']['gaia_id'], + 'name': participant.get('fallback_name', participant['id']['gaia_id']) + } for participant in participant_data] -def getName(id, participants): +def getName(pid, participants): for p in participants: - if id == p['id']: + if pid == p['id']: return p['name'] - return id + return pid def chatName(i): if (('name' in jsonData['conversations'][i]['conversation']['conversation'])and(jsonData['conversations'][i]['conversation']['conversation']['name'] != "")): return jsonData['conversations'][i]['conversation']['conversation']['name'] - for part in simpleJson[i]['participants']: - if part['id'] == jsonData['conversations'][i]['conversation']['conversation']['self_conversation_state']['self_read_state']['participant_id']['gaia_id']: - return part['name'] + for participant in simpleJson[i]['participants']: + if participant['id'] == jsonData['conversations'][i]['conversation']['conversation']['self_conversation_state']['self_read_state']['participant_id']['gaia_id']: + return participant['name'] if __name__ == '__main__': parser = argparse.ArgumentParser() From 2d26ce8482faea7aee9d0bd371443880cf293e12 Mon Sep 17 00:00:00 2001 From: j <@> Date: Fri, 2 Sep 2022 15:03:44 -0500 Subject: [PATCH 04/17] Readability cleanup. No output change. --- HangoutJsonParser.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index c99096e..feb27c5 100644 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -5,7 +5,14 @@ def parseData(): for i in range(0, len(jsonData['conversations'])): conversation = {} conversation['chatName'] = "" - conversation['participants'] = getParticipants(jsonData['conversations'][i]['conversation']['conversation']['participant_data']) + conversation['participants'] = [ + { + 'id': participant['id']['gaia_id'], + 'name': participant.get('fallback_name', participant['id']['gaia_id']) + } + for participant in + jsonData['conversations'][i]['conversation']['conversation']['participant_data'] + ] conversation['messages'] = [] for event in jsonData['conversations'][i]['events']: @@ -32,21 +39,12 @@ def parseData(): simpleJson.append(conversation) simpleJson[i]['chatName'] = chatName(i) - -def getParticipants(participant_data): - return [{ - 'id': participant['id']['gaia_id'], - 'name': participant.get('fallback_name', participant['id']['gaia_id']) - } for participant in participant_data] - - def getName(pid, participants): for p in participants: if pid == p['id']: return p['name'] return pid - def chatName(i): if (('name' in jsonData['conversations'][i]['conversation']['conversation'])and(jsonData['conversations'][i]['conversation']['conversation']['name'] != "")): return jsonData['conversations'][i]['conversation']['conversation']['name'] From 2eec2096bdb58284f3132fc267e0daf2cef46a0d Mon Sep 17 00:00:00 2001 From: j <@> Date: Fri, 2 Sep 2022 15:10:45 -0500 Subject: [PATCH 05/17] Readability cleanup. No output change. --- HangoutJsonParser.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index feb27c5..af16a3c 100644 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -2,20 +2,21 @@ import argparse, json, os def parseData(): - for i in range(0, len(jsonData['conversations'])): - conversation = {} - conversation['chatName'] = "" - conversation['participants'] = [ - { - 'id': participant['id']['gaia_id'], - 'name': participant.get('fallback_name', participant['id']['gaia_id']) - } - for participant in - jsonData['conversations'][i]['conversation']['conversation']['participant_data'] - ] - conversation['messages'] = [] + for orig_conv in jsonData['conversations']: + conversation = { + 'chatName': '', + 'participants': [ + { + 'id': participant['id']['gaia_id'], + 'name': participant.get('fallback_name', participant['id']['gaia_id']) + } + for participant in + orig_conv['conversation']['conversation']['participant_data'] + ], + 'messages': [] + } - for event in jsonData['conversations'][i]['events']: + for event in orig_conv['events']: message = {} message['sender'] = {} message['sender']['name'] = getName(event['sender_id']['gaia_id'], conversation['participants']) @@ -36,8 +37,8 @@ def parseData(): conversation['messages'].append(message) + conversation['chatName'] = chatName(orig_conv, conversation['participants']) simpleJson.append(conversation) - simpleJson[i]['chatName'] = chatName(i) def getName(pid, participants): for p in participants: @@ -45,11 +46,11 @@ def getName(pid, participants): return p['name'] return pid -def chatName(i): - if (('name' in jsonData['conversations'][i]['conversation']['conversation'])and(jsonData['conversations'][i]['conversation']['conversation']['name'] != "")): - return jsonData['conversations'][i]['conversation']['conversation']['name'] - for participant in simpleJson[i]['participants']: - if participant['id'] == jsonData['conversations'][i]['conversation']['conversation']['self_conversation_state']['self_read_state']['participant_id']['gaia_id']: +def chatName(orig_conv, participants): + if (('name' in orig_conv['conversation']['conversation']) and (orig_conv['conversation']['conversation']['name'] != "")): + return orig_conv['conversation']['conversation']['name'] + for participant in participants: + if participant['id'] == orig_conv['conversation']['conversation']['self_conversation_state']['self_read_state']['participant_id']['gaia_id']: return participant['name'] if __name__ == '__main__': From 23c2232ce8d721a736b179524d5d53ea6688d335 Mon Sep 17 00:00:00 2001 From: j <@> Date: Fri, 2 Sep 2022 15:15:32 -0500 Subject: [PATCH 06/17] Readability cleanup. No output change. --- HangoutJsonParser.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index af16a3c..6f67e46 100644 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -17,22 +17,24 @@ def parseData(): } for event in orig_conv['events']: - message = {} - message['sender'] = {} - message['sender']['name'] = getName(event['sender_id']['gaia_id'], conversation['participants']) - message['sender']['id'] = event['sender_id']['gaia_id'] - message['unixtime'] = (int(event['timestamp']))/1000000 + message = { + 'sender': { + 'name': getName(event['sender_id']['gaia_id'], conversation['participants']), + 'id': event['sender_id']['gaia_id'] + }, + 'unixtime': int(event['timestamp'])/1000000 + } if 'chat_message' in event: # if it's a message(normal hangouts, image...) if 'segment' in event['chat_message']['message_content']: # if it's a normal hangouts message content = "" - for k in range(0, len(event['chat_message']['message_content']['segment'])): - if event['chat_message']['message_content']['segment'][k]['type'] == "TEXT": - content = content + event['chat_message']['message_content']['segment'][k]['text'] - elif event['chat_message']['message_content']['segment'][k]['type'] == "LINK": - content = content + event['chat_message']['message_content']['segment'][k]['text'] + for segment in event['chat_message']['message_content']['segment']: + if segment['type'] == "TEXT": + content = content + segment['text'] + elif segment['type'] == "LINK": + content = content + segment['text'] message['content'] = content conversation['messages'].append(message) From fa2ae5855984a22155b2d12cf32ad0a2b6c60138 Mon Sep 17 00:00:00 2001 From: j <@> Date: Fri, 2 Sep 2022 16:04:23 -0500 Subject: [PATCH 07/17] Add more outputs in the form of text files. --- HangoutJsonParser.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index 6f67e46..e9a4f1c 100644 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -1,5 +1,5 @@ #!/usr/bin/python -import argparse, json, os +import argparse, datetime, hashlib, json, os def parseData(): for orig_conv in jsonData['conversations']: @@ -65,3 +65,10 @@ def chatName(orig_conv, participants): simpleJson = [] parseData() json.dump(simpleJson, open(os.path.join(args.OUTPUT_DIRECTORY, 'clean_hangoutsData.json'), 'w', encoding='utf-8'), indent=4) + for chat in simpleJson: + filename = ', '.join(i['name'] for i in chat['participants'])+'.txt' + if len(filename) > os.statvfs(args.OUTPUT_DIRECTORY).f_namemax: + filename = hashlib.sha256(filename.encode('ascii')).hexdigest()+'.txt' + with open(os.path.join(args.OUTPUT_DIRECTORY, filename), 'w') as outtext: + for msg in chat['messages']: + outtext.write(datetime.datetime.fromtimestamp(msg['unixtime']).strftime('%Y-%m-%d %H:%M:%S')+' '+msg['sender']['name']+': '+msg.get('content','')+'\n') From 6659dbd8f886dbda59e8cce9e0269b92e2975af9 Mon Sep 17 00:00:00 2001 From: j <@> Date: Fri, 2 Sep 2022 16:11:39 -0500 Subject: [PATCH 08/17] Fix missing linebreaks. --- HangoutJsonParser.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index e9a4f1c..8be1985 100644 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -31,10 +31,11 @@ def parseData(): # if it's a normal hangouts message content = "" for segment in event['chat_message']['message_content']['segment']: - if segment['type'] == "TEXT": - content = content + segment['text'] - elif segment['type'] == "LINK": + if segment['type'] in ('TEXT','LINK'): content = content + segment['text'] + elif segment['type'] == 'LINE_BREAK': + content += '\n' + else: raise segment message['content'] = content conversation['messages'].append(message) From 80dbe88c69ab4a2851c7feef3690e98e8b41cf5b Mon Sep 17 00:00:00 2001 From: j <@> Date: Sat, 3 Sep 2022 03:08:48 +0000 Subject: [PATCH 09/17] Reorder code for readability and to make it obvious where a bunch of cases were being totally ignored, with TODOs. No output change. --- HangoutJsonParser.py | 52 +++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 18 deletions(-) mode change 100644 => 100755 HangoutJsonParser.py diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py old mode 100644 new mode 100755 index 8be1985..7e346cd --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -17,28 +17,44 @@ def parseData(): } for event in orig_conv['events']: - message = { + def get_readable_content(event): + match event['event_type']: + case 'REGULAR_CHAT_MESSAGE': + content = None + # if it's a message(normal hangouts, image...) + for msg_k,msg_v in event['chat_message']['message_content'].items(): + match msg_k: + case 'segment': # if it's a normal hangouts message + if content == None: content = '' + for segment in msg_v: + match segment['type']: + case 'TEXT'|'LINK': + content += segment['text'] + case 'LINE_BREAK': + content += '\n' + case _: raise segment + case 'attachment': + pass # TODO + case _: raise msg_k + return content + case 'HANGOUT_EVENT': + return None # TODO + case 'ADD_USER'|'REMOVE_USER': + return None # TODO + case 'GROUP_LINK_SHARING_MODIFICATION': + return None # TODO + case 'RENAME_CONVERSATION': + return None # TODO + case _: raise Exception('unhandled event type '+event['event_type']) + + conversation['messages'].append({ 'sender': { 'name': getName(event['sender_id']['gaia_id'], conversation['participants']), 'id': event['sender_id']['gaia_id'] }, - 'unixtime': int(event['timestamp'])/1000000 - } - - if 'chat_message' in event: - # if it's a message(normal hangouts, image...) - if 'segment' in event['chat_message']['message_content']: - # if it's a normal hangouts message - content = "" - for segment in event['chat_message']['message_content']['segment']: - if segment['type'] in ('TEXT','LINK'): - content = content + segment['text'] - elif segment['type'] == 'LINE_BREAK': - content += '\n' - else: raise segment - message['content'] = content - - conversation['messages'].append(message) + 'unixtime': int(event['timestamp'])/1000000, + **({'content':_} if (_:=get_readable_content(event)) != None else {}) + }) conversation['chatName'] = chatName(orig_conv, conversation['participants']) simpleJson.append(conversation) From a7a65bfbcc3a01ed60265dac7b57a6d0bd3cc8f2 Mon Sep 17 00:00:00 2001 From: j <@> Date: Sat, 3 Sep 2022 03:12:13 +0000 Subject: [PATCH 10/17] Add simple HANGOUT_EVENT text. --- HangoutJsonParser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index 7e346cd..fd1a36a 100755 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -38,7 +38,7 @@ def get_readable_content(event): case _: raise msg_k return content case 'HANGOUT_EVENT': - return None # TODO + return event['event_type']+' '+event['hangout_event']['event_type'] case 'ADD_USER'|'REMOVE_USER': return None # TODO case 'GROUP_LINK_SHARING_MODIFICATION': From 94336c73bc48c0431b52168c679c16c01c57a73a Mon Sep 17 00:00:00 2001 From: j <@> Date: Sat, 3 Sep 2022 03:28:05 +0000 Subject: [PATCH 11/17] Assert. Why does google pointlessly duplicate that field though. --- HangoutJsonParser.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index fd1a36a..4126002 100755 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -49,7 +49,7 @@ def get_readable_content(event): conversation['messages'].append({ 'sender': { - 'name': getName(event['sender_id']['gaia_id'], conversation['participants']), + 'name': getName(event['sender_id'], conversation['participants']), 'id': event['sender_id']['gaia_id'] }, 'unixtime': int(event['timestamp'])/1000000, @@ -59,11 +59,12 @@ def get_readable_content(event): conversation['chatName'] = chatName(orig_conv, conversation['participants']) simpleJson.append(conversation) -def getName(pid, participants): +def getName(user, participants): + assert user['gaia_id'] == user['chat_id'] for p in participants: - if pid == p['id']: + if user['gaia_id'] == p['id']: return p['name'] - return pid + return user['gaia_id'] def chatName(orig_conv, participants): if (('name' in orig_conv['conversation']['conversation']) and (orig_conv['conversation']['conversation']['name'] != "")): From 7fca2d778961069d0dd4c011a8857d0fa46f8b97 Mon Sep 17 00:00:00 2001 From: j <@> Date: Sat, 3 Sep 2022 03:31:03 +0000 Subject: [PATCH 12/17] Add ADD_USER/REMOVE_USER text. --- HangoutJsonParser.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index 4126002..553432b 100755 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -40,7 +40,12 @@ def get_readable_content(event): case 'HANGOUT_EVENT': return event['event_type']+' '+event['hangout_event']['event_type'] case 'ADD_USER'|'REMOVE_USER': - return None # TODO + ret = ( + event['event_type']+' '+event['membership_change'].pop('type')+' '+event['membership_change'].pop('leave_reason')+' '+ + ' '.join(repr(getName(i, conversation['participants'])) for i in event['membership_change'].pop('participant_id')) + ) + assert event['membership_change'] == {} + return ret case 'GROUP_LINK_SHARING_MODIFICATION': return None # TODO case 'RENAME_CONVERSATION': From ecc1f37461ac465ad8e516362e67f69774fabe6f Mon Sep 17 00:00:00 2001 From: j <@> Date: Sat, 3 Sep 2022 03:33:20 +0000 Subject: [PATCH 13/17] Add GROUP_LINK_SHARING_MODIFICATION text. --- HangoutJsonParser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index 553432b..9e7d7fd 100755 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -47,7 +47,7 @@ def get_readable_content(event): assert event['membership_change'] == {} return ret case 'GROUP_LINK_SHARING_MODIFICATION': - return None # TODO + return event['event_type']+' '+repr( event['group_link_sharing_modification']) case 'RENAME_CONVERSATION': return None # TODO case _: raise Exception('unhandled event type '+event['event_type']) From 60172b972097af4d3f98881ced46d69df77aff07 Mon Sep 17 00:00:00 2001 From: j <@> Date: Sat, 3 Sep 2022 03:35:22 +0000 Subject: [PATCH 14/17] Add simple RENAME_CONVERSATION text. --- HangoutJsonParser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index 9e7d7fd..e94e4da 100755 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -49,7 +49,7 @@ def get_readable_content(event): case 'GROUP_LINK_SHARING_MODIFICATION': return event['event_type']+' '+repr( event['group_link_sharing_modification']) case 'RENAME_CONVERSATION': - return None # TODO + return event['event_type']+' '+repr(event['conversation_rename']) case _: raise Exception('unhandled event type '+event['event_type']) conversation['messages'].append({ From 88dd9e2d315564c29994cad825657544c5d33877 Mon Sep 17 00:00:00 2001 From: j <@> Date: Sat, 3 Sep 2022 14:13:35 +0000 Subject: [PATCH 15/17] Add some not-so-pretty handling of attachments and weird Google+ things, so at least they are not totally lost. --- HangoutJsonParser.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index e94e4da..f188a7e 100755 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -20,12 +20,11 @@ def parseData(): def get_readable_content(event): match event['event_type']: case 'REGULAR_CHAT_MESSAGE': - content = None + content = '' # if it's a message(normal hangouts, image...) for msg_k,msg_v in event['chat_message']['message_content'].items(): match msg_k: case 'segment': # if it's a normal hangouts message - if content == None: content = '' for segment in msg_v: match segment['type']: case 'TEXT'|'LINK': @@ -34,7 +33,15 @@ def get_readable_content(event): content += '\n' case _: raise segment case 'attachment': - pass # TODO + for att in msg_v: + match att['embed_item']['type']: + case ['PLUS_PHOTO']: + content += ' [attachment '+att['embed_item']['plus_photo']['url']+' ]' + case ['PLACE_V2', 'THING_V2', 'THING']: + content += ' [place '+', '.join(k+' '+repr(v) for k,v in att['embed_item']['place_v2'].items())+']' + case ['DYNAMITE_MESSAGE_METADATA']: + content += ' [upload_metadata '+repr(att['embed_item']['dynamite_message_metadata'])+']' + case _: raise Exception('unhandled attachment '+json.dumps(att,indent=4)) case _: raise msg_k return content case 'HANGOUT_EVENT': @@ -58,7 +65,7 @@ def get_readable_content(event): 'id': event['sender_id']['gaia_id'] }, 'unixtime': int(event['timestamp'])/1000000, - **({'content':_} if (_:=get_readable_content(event)) != None else {}) + 'content': get_readable_content(event) }) conversation['chatName'] = chatName(orig_conv, conversation['participants']) @@ -94,4 +101,4 @@ def chatName(orig_conv, participants): filename = hashlib.sha256(filename.encode('ascii')).hexdigest()+'.txt' with open(os.path.join(args.OUTPUT_DIRECTORY, filename), 'w') as outtext: for msg in chat['messages']: - outtext.write(datetime.datetime.fromtimestamp(msg['unixtime']).strftime('%Y-%m-%d %H:%M:%S')+' '+msg['sender']['name']+': '+msg.get('content','')+'\n') + outtext.write(datetime.datetime.fromtimestamp(msg['unixtime']).strftime('%Y-%m-%d %H:%M:%S')+' '+msg['sender']['name']+': '+msg['content']+'\n') From 533141f1867cec45a7da55ad42704e3cc85a52f2 Mon Sep 17 00:00:00 2001 From: j <@> Date: Sat, 3 Sep 2022 14:44:42 +0000 Subject: [PATCH 16/17] Deal with issue in the Hangouts.json, where some conversations lack a name for a user. First use the locally defined name if available. Fall back to global name, then to no name at all just ID. --- HangoutJsonParser.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index f188a7e..fb6b805 100755 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -2,13 +2,22 @@ import argparse, datetime, hashlib, json, os def parseData(): + global gaia_to_name + # First pass just to collect names, to deal with issue in the Hangouts.json, where some conversations lack a name for a user, when others show the name correctly. + gaia_to_name = {} + for orig_conv in jsonData['conversations']: + for participant in orig_conv['conversation']['conversation']['participant_data']: + assert participant['id']['gaia_id'] == participant['id']['chat_id'] + if 'fallback_name' in participant: + gaia_to_name[participant['id']['gaia_id']] = participant['fallback_name'] + for orig_conv in jsonData['conversations']: conversation = { 'chatName': '', 'participants': [ { 'id': participant['id']['gaia_id'], - 'name': participant.get('fallback_name', participant['id']['gaia_id']) + 'name': participant.get('fallback_name') } for participant in orig_conv['conversation']['conversation']['participant_data'] @@ -49,7 +58,7 @@ def get_readable_content(event): case 'ADD_USER'|'REMOVE_USER': ret = ( event['event_type']+' '+event['membership_change'].pop('type')+' '+event['membership_change'].pop('leave_reason')+' '+ - ' '.join(repr(getName(i, conversation['participants'])) for i in event['membership_change'].pop('participant_id')) + ' '.join(repr(getName(i['gaia_id'],conversation)) for i in event['membership_change'].pop('participant_id')) ) assert event['membership_change'] == {} return ret @@ -61,7 +70,7 @@ def get_readable_content(event): conversation['messages'].append({ 'sender': { - 'name': getName(event['sender_id'], conversation['participants']), + 'name': getName(event['sender_id']['gaia_id'],conversation), 'id': event['sender_id']['gaia_id'] }, 'unixtime': int(event['timestamp'])/1000000, @@ -71,12 +80,13 @@ def get_readable_content(event): conversation['chatName'] = chatName(orig_conv, conversation['participants']) simpleJson.append(conversation) -def getName(user, participants): - assert user['gaia_id'] == user['chat_id'] - for p in participants: - if user['gaia_id'] == p['id']: +def getName(user_id, conversation): + global gaia_to_name + # First use the locally defined one if available. + for p in conversation['participants']: + if user_id == p['id'] and p['name'] != None: return p['name'] - return user['gaia_id'] + return gaia_to_name.get(user_id, user_id) # Fall back to global name, then to no name at all just ID. def chatName(orig_conv, participants): if (('name' in orig_conv['conversation']['conversation']) and (orig_conv['conversation']['conversation']['name'] != "")): @@ -96,7 +106,7 @@ def chatName(orig_conv, participants): parseData() json.dump(simpleJson, open(os.path.join(args.OUTPUT_DIRECTORY, 'clean_hangoutsData.json'), 'w', encoding='utf-8'), indent=4) for chat in simpleJson: - filename = ', '.join(i['name'] for i in chat['participants'])+'.txt' + filename = ', '.join(getName(i['id'],chat) for i in chat['participants'])+'.txt' if len(filename) > os.statvfs(args.OUTPUT_DIRECTORY).f_namemax: filename = hashlib.sha256(filename.encode('ascii')).hexdigest()+'.txt' with open(os.path.join(args.OUTPUT_DIRECTORY, filename), 'w') as outtext: From 56becbcbdcd85215e03859c5e9cc3372695a232d Mon Sep 17 00:00:00 2001 From: j <@> Date: Sat, 3 Sep 2022 14:57:03 +0000 Subject: [PATCH 17/17] Ensure python3 up front. --- HangoutJsonParser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/HangoutJsonParser.py b/HangoutJsonParser.py index fb6b805..3375ed5 100755 --- a/HangoutJsonParser.py +++ b/HangoutJsonParser.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 -Bubb import argparse, datetime, hashlib, json, os def parseData(): @@ -101,10 +101,10 @@ def chatName(orig_conv, participants): parser.add_argument('OUTPUT_DIRECTORY', help='Path to write output files.') args = parser.parse_args() - jsonData = json.load(open(args.INPUT_JSON_PATH, 'r', encoding='utf-8')) + jsonData = json.load(open(args.INPUT_JSON_PATH, 'r')) simpleJson = [] parseData() - json.dump(simpleJson, open(os.path.join(args.OUTPUT_DIRECTORY, 'clean_hangoutsData.json'), 'w', encoding='utf-8'), indent=4) + json.dump(simpleJson, open(os.path.join(args.OUTPUT_DIRECTORY, 'clean_hangoutsData.json'), 'w'), indent=4) for chat in simpleJson: filename = ', '.join(getName(i['id'],chat) for i in chat['participants'])+'.txt' if len(filename) > os.statvfs(args.OUTPUT_DIRECTORY).f_namemax: