-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlanguage.py
185 lines (130 loc) · 7.1 KB
/
language.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import load_data
def load_language_data(data_dir="."):
    """Count the countries per language and return them with their labels.

    For English, French, Chinese, Spanish and Arabic the count is the number
    of distinct values in the ``Country`` column of ``<language>_data.csv``.
    The counts for German, Portuguese, Vietnamese and Japanese are fixed
    constants and are not read from any file.

    Args:
        data_dir: Directory containing the per-language CSV files. Defaults
            to the current working directory, which is where the files were
            always looked up before this parameter existed.

    Returns:
        A tuple ``(language_counts, languages, total_country)`` where
        ``language_counts[i]`` is the country count for ``languages[i]`` and
        ``total_country`` is the fixed total of 195 used as the denominator
        by the plotting code.

    Raises:
        FileNotFoundError: If one of the CSV files is missing.
        KeyError: If a CSV file has no ``Country`` column.
    """
    # Languages whose country count is derived from a CSV file.
    csv_backed = ("English", "French", "Chinese", "Spanish", "Arabic")
    # Languages whose country count is hard-coded.
    fixed_counts = {"German": 6, "Portuguese": 9, "Vietnamese": 1, "Japanese": 1}

    base = Path(data_dir)
    counts = {}
    for name in csv_backed:
        frame = pd.read_csv(base / f"{name.lower()}_data.csv")
        # NOTE(review): an earlier comment said only countries whose primary
        # language is 'Yes' are counted, but no such filter is applied here;
        # presumably the CSV files are already filtered -- confirm.
        counts[name] = frame["Country"].nunique()
    counts.update(fixed_counts)

    # dicts preserve insertion order, so labels and counts stay aligned.
    languages = list(counts)
    language_counts = [counts[name] for name in languages]
    total_country = 195
    return language_counts, languages, total_country
def analyze_language_data(data):
    """Return the dataset share, in percent, of each language above 1 %.

    Args:
        data: DataFrame with a ``Language`` column.

    Returns:
        A plain list of floats: for every distinct ``Language`` value, the
        percentage of all rows in ``data`` that carry it, keeping only
        values strictly greater than 1. The list follows the order produced
        by ``value_counts`` and carries no language labels.
    """
    row_total = len(data)
    # Occurrences per language, scaled to a percentage of all rows.
    share = data['Language'].value_counts().div(row_total).mul(100)
    return share[share > 1].tolist()
def plot_language_comparison(language_counts, languages, total_country, dataset_percent):
    """Draw a grouped bar chart of country share vs. dataset share per language.

    Each language gets two adjacent bars: the percentage of countries
    (``count / total_country * 100``) and the matching entry of
    ``dataset_percent``. Both bar groups are annotated with their value.
    The figure is created on the pyplot state machine; this function does
    not call ``plt.show()``.

    Args:
        language_counts: Number of countries per language.
        languages: Tick labels, one per language.
        total_country: Denominator used to turn counts into percentages.
        dataset_percent: Percentages for the second bar group; paired with
            ``languages`` purely by position.
    """
    # Share of all countries for each language.
    country_percentages = [count / total_country * 100 for count in language_counts]

    bar_width = 0.35
    country_positions = list(range(len(languages)))
    # Shift the second group by one bar width so the bars sit side by side.
    dataset_positions = [pos + bar_width for pos in country_positions]

    plt.figure(figsize=(12, 6))
    country_bars = plt.bar(
        country_positions, country_percentages, width=bar_width, color='skyblue',
        label='Percentage of Countries Using It as Official Language',
    )
    dataset_bars = plt.bar(
        dataset_positions, dataset_percent, width=bar_width, color='orange', alpha=0.7,
        label='Percentage of Appearance of Language in Dataset',
    )

    plt.xlabel('Languages')
    plt.ylabel('Percentage')
    plt.title('Comparison of Country vs Language Percentage')
    # Centre each tick between the two bars of its group.
    plt.xticks([pos + bar_width / 2 for pos in country_positions], languages)
    plt.legend()

    # Write the numeric value on top of every bar in both groups.
    for bars, percents in ((country_bars, country_percentages), (dataset_bars, dataset_percent)):
        for bar, percent in zip(bars, percents):
            plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
                     f'{percent:.2f}%', ha='center', va='bottom')

    plt.tight_layout()
def get_language_population(language, csv_path='language_speaking_population.csv'):
    """Look up the total number of speakers of ``language``.

    The CSV file is read on every call and must provide the columns
    ``Language`` and ``Total speakers (L1+L2)``.

    Args:
        language: Language name. ``'Arabic'`` is special-cased: the speaker
            totals of every row whose ``Language`` contains ``'Arabic'`` are
            summed. Any other name must match a ``Language`` value exactly,
            and the first matching row is used.
        csv_path: Location of the speaker-population CSV. Defaults to the
            file name that was previously hard-coded.

    Returns:
        The speaker total as stored in the ``Total speakers (L1+L2)`` column
        (a sum over rows for ``'Arabic'``).

    Raises:
        IndexError: If ``language`` is not ``'Arabic'`` and no row matches it.
    """
    population_data = pd.read_csv(csv_path)
    names = population_data['Language']
    speakers = population_data['Total speakers (L1+L2)']

    if language == 'Arabic':
        # na=False treats rows with a missing Language as non-matching;
        # without it the mask contains NaN and boolean indexing raises.
        return speakers[names.str.contains('Arabic', na=False)].sum()

    matches = speakers[names == language]
    if matches.empty:
        # Same exception type that .iloc[0] raised before, with a usable message.
        raise IndexError(f"no row for language {language!r} in {csv_path!r}")
    return matches.iloc[0]
def population():
    """Collect the speaker totals for the fixed list of languages.

    Returns:
        A tuple ``(language_populations, languages, total_amount)``:
        the value of ``get_language_population`` for each language, the
        language names in the same order, and the constant 8045311447 that
        callers use as the denominator (presumably the world population --
        confirm the source of this figure).
    """
    languages = ['English', 'French', 'Chinese', 'Spanish', 'Arabic',
                 'German', 'Portuguese', 'Vietnamese', 'Japanese']
    # One lookup per language, kept in the same order as the labels.
    language_populations = [get_language_population(name) for name in languages]
    total_amount = 8045311447
    return language_populations, languages, total_amount
def language_percentage(population_lst, total_population):
    """Express each speaker count as a percentage of ``total_population``.

    Args:
        population_lst: Iterable of speaker counts.
        total_population: Denominator shared by all counts.

    Returns:
        A list with ``count / total_population * 100`` for every count, in
        input order.
    """
    return [speakers / total_population * 100 for speakers in population_lst]
def plot_population_comparison(language_percent, languages, dataset_percent):
    """Show a grouped bar chart of speaker share vs. dataset share per language.

    Args:
        language_percent: Percentage of speakers per language (first bars).
        languages: Tick labels, one per language.
        dataset_percent: Percentage of dataset rows per language (second
            bars); paired with ``languages`` purely by position.

    The figure is annotated with the value of every bar and then displayed
    with ``plt.show()``.
    """
    bar_width = 0.35
    speaker_x = list(range(len(language_percent)))
    dataset_x = [x + bar_width for x in speaker_x]
    # Ticks sit halfway between the two bars of each group.
    tick_x = [x + bar_width / 2 for x in speaker_x]

    def annotate(bars, values):
        # Print each value just above the centre of its bar.
        for rect, value in zip(bars, values):
            plt.text(rect.get_x() + rect.get_width() / 2, rect.get_height(),
                     f'{value:.2f}%', ha='center', va='bottom')

    plt.figure(figsize=(12, 6))
    speaker_bars = plt.bar(speaker_x, language_percent, width=bar_width,
                           color='skyblue', label='Percentage of Language Speaker')
    dataset_bars = plt.bar(dataset_x, dataset_percent, width=bar_width,
                           color='orange', alpha=0.7,
                           label='Percentage of Appearance of Language in Dataset')

    plt.xlabel('Languages')
    plt.ylabel('Percentage')
    plt.title('Comparison of Speaker Population vs Language Percentage')
    plt.xticks(tick_x, languages)
    plt.legend()

    annotate(speaker_bars, language_percent)
    annotate(dataset_bars, dataset_percent)

    plt.tight_layout()
    plt.show()
def main():
    """Load the dataset and draw both language comparison charts."""
    dataset = load_data.load_data()

    # Share of dataset rows per language, limited to languages above 1 %.
    # NOTE(review): this is a bare list without language labels, and both
    # plots pair it with the fixed language list by position -- verify that
    # its length and order actually match that list.
    dataset_share = analyze_language_data(dataset)

    # Chart 1: share of countries per language vs. share in the dataset.
    country_counts, country_languages, country_total = load_language_data()
    plot_language_comparison(country_counts, country_languages, country_total, dataset_share)

    # Chart 2: share of speakers per language vs. share in the dataset.
    speaker_counts, speaker_languages, speaker_total = population()
    speaker_share = language_percentage(speaker_counts, speaker_total)
    plot_population_comparison(speaker_share, speaker_languages, dataset_share)
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()