-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsample.py
52 lines (43 loc) · 2.05 KB
/
sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import json
import os.path
import re
def clean_text(raw_text):
    """Blank out every character that is not an ASCII letter or one of . , ? !

    Each disallowed character (digits, whitespace, symbols, non-ASCII
    letters, ...) is replaced by exactly one space, so the result has the
    same length as the input. Runs of spaces are not collapsed here.

    Args:
        raw_text: The string to sanitise.

    Returns:
        The sanitised string.
    """
    disallowed = re.compile(r'[^a-zA-Z.,?!]')
    return disallowed.sub(' ', raw_text)
def find_text(file_path, target_id):
    """Look up one record in a JSON-lines file and return its cleaned text.

    The file is read line by line; each non-blank line is parsed as JSON.
    The first record whose ``id`` (compared as a string) equals
    ``target_id`` and whose ``text`` is not ``None`` is selected. Its text
    is then stripped of a trailing URL, ``@``/``#`` tokens, ``:name:``
    tokens and the literal ``RT``, whitespace is collapsed, and the result
    is passed through ``clean_text``.

    Args:
        file_path: Path of the JSON-lines file to scan.
        target_id: Identifier to look for; compared via ``str()`` so both
            ``"123"`` and ``123`` match a record with id 123.

    Returns:
        The cleaned text of the matching record, or ``""`` when the file
        does not exist or no usable record matches (a message is printed
        in both of those cases).
    """
    if not os.path.exists(file_path):
        print(f"找不到 {file_path} ")
        return ""

    wanted_id = str(target_id)

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads('') would raise and abort the whole scan.
            if not line:
                continue
            try:
                tweet_data = json.loads(line)
            except json.JSONDecodeError:
                # One corrupt line should not hide the records after it.
                continue
            # Only JSON objects can carry 'id' / 'text' keys.
            if not isinstance(tweet_data, dict):
                continue

            tweet_text = tweet_data.get('text', None)
            if tweet_text is None or str(tweet_data.get('id', None)) != wanted_id:
                continue

            # Drop a URL sitting at the very end of the text.
            tweet_text = re.sub(r'\s*http\S*$', '', tweet_text)
            tweet_text = re.sub(r'\s*https\S*$', '', tweet_text)
            # Drop @mentions, #hashtags and :name: tokens.
            tweet_text = re.sub(r'[@#]\S+|:[a-zA-Z]+:', '', tweet_text)
            # NOTE(review): this removes the letters "RT" anywhere, including
            # inside words (e.g. "START" -> "STA"). Kept as-is so existing
            # outputs do not change; confirm whether r'\bRT\b' was intended.
            tweet_text = re.sub(r'RT', '', tweet_text)
            # Second pass for tokens exposed by the removals above, and
            # collapse every whitespace run into a single space.
            tweet_text = re.sub(r'[@#]\S+|:[a-zA-Z_]+:|\s+', ' ', tweet_text).strip()
            return clean_text(tweet_text)

    # Reached only after every line was scanned without a usable match.
    print(f"找不到ID为 {target_id} 的记录")
    return ""
# Build the positive-sample text file: for every "<user_folder> <image_number>"
# entry in the sample list, look the record up in that user's timeline file
# and write "<user_folder> <image_number> <cleaned text>" to the output file.
dataset_dir = "D://PyCharm/FS1/UP-MPF-main/datasets/MultiModalDataset"
file_path = f"{dataset_dir}/positiveSampleAll.txt"
output_file_path = f"{dataset_dir}/positiveData2.txt"

# Both files are managed by the same `with`, so they are closed even if a
# lookup raises. The input is opened first, so a missing sample list fails
# before the output file is created or truncated.
with open(file_path, 'r', encoding='utf-8') as fin, \
        open(output_file_path, 'w', encoding='utf-8') as output_file:
    for line in fin:
        fields = line.strip().split(maxsplit=1)
        # Skip blank lines and entries that lack the second field; unpacking
        # them into two names would raise ValueError.
        if len(fields) != 2:
            continue
        user_folder, image_number = fields
        text = find_text(
            f'{dataset_dir}/positive/{user_folder}/timeline.txt',
            image_number,
        )
        # Entries whose lookup yields an empty or one-character text are dropped.
        if len(text) > 1:
            output_file.write(f"{user_folder} {image_number} {text} \n")

print(f"Output written to {output_file_path}")