-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_scraping.py
167 lines (131 loc) · 6.52 KB
/
web_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""
Script: web_scraping.py
Description:
This script automates the process of downloading data from the WHO Global Tuberculosis Programme website
and performs various data processing techniques on the downloaded data.
Functions:
1. download_data(url, download_directory)
- Downloads data from a specified URL and saves it to a designated directory.
- Parameters:
- url (str): The URL of the webpage to download the data from.
- download_directory (str): The directory where the downloaded file will be saved.
2. data_processing(csv_file_path, output_directory)
- Applies various data processing techniques on the downloaded CSV file.
- Parameters:
- csv_file_path (str): The path to the downloaded CSV file.
- output_directory (str): The directory where the processed data will be saved.
- Data Processing Techniques:
1. Handling Missing Values:
- Drops rows with any missing values.
2. Data Transformation:
- Converts string columns to lowercase.
3. Data Aggregation:
- Groups by 'country' and calculates the mean of numeric columns.
4. Data Filtering:
- Filters rows based on a condition (example condition provided in the code).
Main Block:
- Paths to the desired browser's WebDriver, the download directory, and the output directory are specified.
- Data is downloaded using the download_data function.
- CSV files are searched for in the download directory.
- If CSV files are found, data processing is performed using the data_processing function.
- Processed data is saved to CSV files in the output directory.
Note:
- Ensure that the paths to WebDriver and directories are correctly specified according to your system configuration.
- Modify the data processing techniques and conditions in the data_processing function as needed for your specific data requirements.
- This script assumes that there is only one CSV file in the download directory. If there are multiple files, additional logic may be required to handle them appropriately.
"""
# Code Begins
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
import time
import os
import pandas as pd
def download_data(url, download_directory, driver_path=None, timeout=60, poll_interval=0.5):
    """
    Download the CSV linked from the given page into the specified directory.

    Opens the page in Microsoft Edge, clicks the first link whose href contains
    'generateCSV.asp', and waits until a new, fully written file shows up in the
    download directory.

    Parameters:
    url (str): The URL of the webpage to download the data from.
    download_directory (str): The directory where the downloaded file will be saved.
        It is created if it does not exist.
    driver_path (str, optional): Path to the Edge WebDriver executable. When omitted,
        a module-level ``edge_driver_path`` is used if one is defined; otherwise
        Selenium is left to locate the driver itself.
    timeout (float, optional): Maximum number of seconds to wait for the download.
    poll_interval (float, optional): Seconds between checks of the download directory.

    Returns:
    str or None: Path of the newly downloaded file, or None if no finished file
        appeared before the timeout.
    """
    if driver_path is None:
        # Backward compatibility: the script's main block sets this name globally.
        driver_path = globals().get("edge_driver_path")

    os.makedirs(download_directory, exist_ok=True)
    # Snapshot the directory so the new download can be told apart from old files.
    files_before = set(os.listdir(download_directory))

    # Initialize Edge driver with download directory
    options = webdriver.EdgeOptions()
    options.add_experimental_option(
        "prefs", {"download.default_directory": download_directory}
    )
    service = Service(driver_path) if driver_path else Service()
    driver = webdriver.Edge(service=service, options=options)

    try:
        # Open the URL in the browser
        driver.get(url)
        # Find the download link based on the href attribute value and click it
        download_link = driver.find_element(By.CSS_SELECTOR, "a[href*='generateCSV.asp']")
        download_link.click()

        # Poll instead of sleeping a fixed time. Files with these suffixes are
        # treated as still in progress (Chromium-based browsers use .crdownload
        # for partial downloads -- NOTE(review): confirm for the Edge version in use).
        partial_suffixes = (".crdownload", ".tmp")
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            new_files = set(os.listdir(download_directory)) - files_before
            finished = sorted(
                name for name in new_files
                if not name.lower().endswith(partial_suffixes)
            )
            # Done only when something new exists and nothing is still partial.
            if finished and len(finished) == len(new_files):
                return os.path.join(download_directory, finished[0])
            time.sleep(poll_interval)
        return None
    finally:
        # Always close the browser, even if navigation or the click failed.
        driver.quit()
def data_processing(csv_file_path, output_directory, *, group_column='country',
                    filter_column='m_01', threshold=100):
    """
    Run a set of demonstration processing steps on a CSV file and save the results.

    Steps:
    1. Missing values: rows with any missing value are dropped (taken from the data
       as read, before the lowercase conversion) -> 'processed_data.csv'.
    2. Transformation: values in text columns are converted to lowercase.
    3. Aggregation: mean of the numeric columns per ``group_column``
       -> 'aggregated_data.csv'.
    4. Filtering: rows where ``filter_column`` is greater than ``threshold``
       -> 'filtered_data.csv'.

    Parameters:
    csv_file_path (str): The path to the downloaded CSV file.
    output_directory (str): The directory where the processed data will be saved.
        It is created if it does not exist.
    group_column (str, keyword-only): Column to group by for the aggregation.
    filter_column (str, keyword-only): Column compared against ``threshold``.
    threshold (number, keyword-only): Rows are kept when ``filter_column`` exceeds it.

    Raises:
    KeyError: If ``group_column`` or ``filter_column`` is not a column of the CSV.
    """
    # Reads the CSV file into a pandas DataFrame.
    df = pd.read_csv(csv_file_path)

    # Fail early, with a readable message, before printing or writing anything.
    missing_columns = [
        name for name in dict.fromkeys((group_column, filter_column))
        if name not in df.columns
    ]
    if missing_columns:
        raise KeyError(
            f"Required column(s) {missing_columns} not found in {csv_file_path}"
        )

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_directory, exist_ok=True)

    # Displays the original DataFrame.
    print("Original DataFrame:")
    print(df.head())

    # 1. Handling Missing Values
    # dropna returns a new frame, so the later lowercase step does not affect it.
    processed_df = df.dropna()

    # 2. Data Transformation
    # Lowercase element by element so that non-string values (NaN, numbers in a
    # mixed column) are passed through unchanged instead of becoming NaN.
    is_object = pd.api.types.is_object_dtype
    is_string = pd.api.types.is_string_dtype
    for column in df.columns:
        if is_object(df[column]) or is_string(df[column]):
            df[column] = df[column].map(
                lambda value: value.lower() if isinstance(value, str) else value
            )

    # 3. Data Aggregation
    # The grouping column is excluded from the columns being averaged.
    numeric_columns = [
        name for name in df.select_dtypes(include=['number']).columns
        if name != group_column
    ]
    aggregated_df = df.groupby(group_column)[numeric_columns].mean().reset_index()

    # 4. Data Filtering
    filtered_df = df[df[filter_column] > threshold]

    # Displaying the processed DataFrames
    print("\nProcessed DataFrames:")
    print("Processed DataFrame with missing values removed:")
    print(processed_df.head())
    print("\nProcessed DataFrame with string columns converted to lowercase:")
    print(df.head())
    print("\nAggregated DataFrame with mean values per country:")
    print(aggregated_df.head())
    print(f"\nFiltered DataFrame with '{filter_column}' greater than {threshold}:")
    print(filtered_df.head())

    # Save the processed DataFrames to CSV files
    processed_df.to_csv(os.path.join(output_directory, 'processed_data.csv'), index=False)
    aggregated_df.to_csv(os.path.join(output_directory, 'aggregated_data.csv'), index=False)
    filtered_df.to_csv(os.path.join(output_directory, 'filtered_data.csv'), index=False)
if __name__ == "__main__":
    # Path to the desired browser's WebDriver.
    # Kept as a module-level name because download_data looks it up globally.
    edge_driver_path = r'C:\msedgedriver.exe'
    # URL of the WHO's Global Tuberculosis Programme data page.
    url = "https://www.who.int/teams/global-tuberculosis-programme/data"
    # Path to save the downloaded file.
    download_directory = r'C:\Users\Admin\Desktop\InternCareer'
    # Path to save the processed data.
    output_directory = r'C:\Users\Admin\Desktop\InternCareer'

    # Files written by data_processing. The output directory is the same folder
    # as the download directory, so these must not be mistaken for the download
    # on a later run.
    generated_files = {'processed_data.csv', 'aggregated_data.csv', 'filtered_data.csv'}

    # Make sure the directory exists so it can be listed after the download.
    os.makedirs(download_directory, exist_ok=True)

    # Download the data.
    download_data(url, download_directory)

    # Collect candidate CSV files (case-insensitive extension match),
    # skipping this script's own outputs.
    csv_paths = [
        os.path.join(download_directory, name)
        for name in os.listdir(download_directory)
        if name.lower().endswith('.csv') and name.lower() not in generated_files
    ]

    if csv_paths:
        # os.listdir order is arbitrary; the most recently modified file is the
        # one that was just downloaded.
        csv_file_path = max(csv_paths, key=os.path.getmtime)
        # Perform data processing
        data_processing(csv_file_path, output_directory)
    else:
        print("No CSV files were found in the download directory..")