This repository has been archived by the owner on Dec 1, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
splitter.py
67 lines (54 loc) · 1.77 KB
/
splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
from __future__ import annotations
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import relatio
from dateutil import parser
from relatio.utils import split_into_sentences
from relatio.wrappers import run_srl
# Path for files to be annotated
path_to_data_sets = (
    "/cluster/home/sidray/work/Ash_Galletta_Widmer/data/scrapes_since_1980"
)
# NOTE(review): path_to_save is not referenced below -- the per-day files are
# written under path_to_data_sets. Confirm which location is intended.
path_to_save = "/cluster/work/lawecon/Projects/annot_data_Ash_Widmer/non_daily_data"
semantics_dict = {}  # not referenced by this script


def folder_name_for(file_name: str) -> str:
    """Return the output sub-folder name derived from *file_name*.

    The name is the last four characters of everything before the first dot,
    e.g. ``"scrapes_1985.csv"`` -> ``"1985"``.
    """
    return file_name.split(".")[0][-4:]


def normalize_date(date_text: str) -> str:
    """Parse *date_text* with dateutil and format it as ``YYYY-MM-DD``."""
    return parser.parse(date_text).strftime("%Y-%m-%d")


def split_by_date(csv_path, output_root=path_to_data_sets, normalize=normalize_date):
    """Split the CSV at *csv_path* into one CSV file per normalized date.

    Rows are grouped by the normalized value of their ``date`` column and each
    group is written to ``<output_root>/<folder>/<day>.csv``, where
    ``<folder>`` comes from :func:`folder_name_for`. Rows with a missing
    ``date`` are skipped.

    Grouping uses the *normalized* date rather than the raw cell text, so raw
    values that normalize to the same day (for instance two timestamps on one
    day) end up together in a single file instead of overwriting each other.

    Args:
        csv_path: Path of the CSV file to split; it must have a ``date`` column.
        output_root: Existing directory under which the sub-folder is created.
        normalize: Callable mapping one raw ``date`` value to the output file
            stem. Defaults to :func:`normalize_date`.

    Returns:
        The paths of the files written, in sorted order of normalized date.

    Raises:
        FileNotFoundError: If *csv_path* or *output_root* does not exist.
        KeyError: If the CSV has no ``date`` column.
    """
    file_name = os.path.basename(csv_path)
    original_df = pd.read_csv(csv_path)
    print(original_df.head())

    folder_name = folder_name_for(file_name)
    print(folder_name)
    target_dir = os.path.join(output_root, folder_name)
    print(target_dir)
    if Path(target_dir).is_dir():
        print("yes")
    else:
        print("no")
        # exist_ok=True: the folder may be created by another process between
        # the check above and this call. Parents are deliberately not created,
        # so a missing output_root still fails loudly.
        Path(target_dir).mkdir(exist_ok=True)
        print("New path is: {}".format(target_dir))

    raw_dates = original_df["date"]
    # Normalize each distinct raw value once instead of once per row.
    day_by_raw = {raw: normalize(raw) for raw in raw_dates.dropna().unique()}
    # Missing dates map to NaN, which groupby drops.
    day_keys = raw_dates.map(day_by_raw)

    written = []
    for day, day_df in original_df.groupby(day_keys):
        print(day)
        out_path = os.path.join(target_dir, day + ".csv")
        day_df.reset_index(drop=True).to_csv(out_path, index=False)
        written.append(out_path)
    return written


def main(argv=None) -> int:
    """Run the splitter on the file named by the first command-line argument.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success (including when the file is not a
        ``.csv`` and is therefore ignored), 2 when the file argument is missing.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print("usage: splitter.py <path/to/file.csv>", file=sys.stderr)
        return 2

    file_path = args[1]
    file_name = os.path.basename(file_path)
    print(file_path, file_name)

    # only .csv files matter
    if not file_name.endswith(".csv"):
        return 0

    print(file_name)
    split_by_date(file_path)
    return 0


if __name__ == "__main__":
    sys.exit(main())