You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Hi,
I have a Koalas DataFrame with age and income columns. I calculated the z-score of age and of income, and then computed the norm of each (age_zscore, income_zscore) pair and stored it in a new column named sq_dist. I then called idxmin on the new column, but it is not giving the minimum value.
I did the same operations on a pandas DataFrame, and there it does give the minimum value.
Please find attached the notebook with the step-by-step operations I performed.
cmd1
import databricks.koalas as ks
import pandas as pd
import random
cmd2
# Build the sample Koalas DataFrame: two columns of 100,000 random integers
# in [0, 100000]. 'Age' is generated before 'Income', as in a dict literal.
df = ks.DataFrame.from_dict(
    {
        column: [random.randint(0, 100000) for _ in range(100000)]
        for column in ('Age', 'Income')
    }
)
# Show the first five rows.
print(df.head(5))
cmd3
import scipy.stats as stats
import numpy as np
# Turn on 'compute.ops_on_diff_frames' for the column assignments below.
ks.set_option('compute.ops_on_diff_frames', True)
# Z-score each column with SciPy on its NumPy export, then wrap the result in a new Koalas Series.
df['Income_zscore'] = ks.Series(stats.zscore(df['Income'].to_numpy()))
df['Age_zscore'] = ks.Series(stats.zscore(df['Age'].to_numpy()))
# Per-row norm of the (Income_zscore, Age_zscore) pair, built as a plain Python list.
# NOTE(review): np.linalg.norm gives the distance itself, not its square, despite the column name.
# NOTE(review): these assignments assume the new values line up with df's existing rows — confirm.
df['sq_dist'] = [np.linalg.norm(i) for i in df[['Income_zscore','Age_zscore']].to_numpy()]
# Switch the option back off once the columns are attached.
ks.set_option('compute.ops_on_diff_frames', False)
cmd4
#display(df)
cmd5
# Find the index label of the smallest sq_dist value (idxmin gives the label, not the value).
minindex=df['sq_dist'].idxmin()
# Bare expression so the notebook displays the result.
minindex
cmd6
# Display the sq_dist value at minindex.
# NOTE(review): idxmin() returns an index label while .iloc is position-based; if row order is
# not guaranteed to follow the index here, .loc[minindex] may be what is intended — confirm.
df['sq_dist'].iloc[minindex]
Hi,
I have a Koalas DataFrame with age and income columns. I calculated the z-score of age and of income, and then computed the norm of each (age_zscore, income_zscore) pair and stored it in a new column named sq_dist. I then called idxmin on the new column, but it is not giving the minimum value.
I did the same operations on a pandas DataFrame, and there it does give the minimum value.
Please find attached the notebook with the step-by-step operations I performed.
cmd1
import databricks.koalas as ks
import pandas as pd
import random
cmd2
# Build the sample Koalas DataFrame: two columns of 100,000 random integers
# in [0, 100000]. 'Age' is generated before 'Income', as in a dict literal.
df = ks.DataFrame.from_dict(
    {
        column: [random.randint(0, 100000) for _ in range(100000)]
        for column in ('Age', 'Income')
    }
)
# Show the first five rows.
print(df.head(5))
cmd3
import scipy.stats as stats
import numpy as np
# Turn on 'compute.ops_on_diff_frames' for the column assignments below.
ks.set_option('compute.ops_on_diff_frames', True)
# Z-score each column with SciPy on its NumPy export, then wrap the result in a new Koalas Series.
df['Income_zscore'] = ks.Series(stats.zscore(df['Income'].to_numpy()))
df['Age_zscore'] = ks.Series(stats.zscore(df['Age'].to_numpy()))
# Per-row norm of the (Income_zscore, Age_zscore) pair, built as a plain Python list.
# NOTE(review): np.linalg.norm gives the distance itself, not its square, despite the column name.
# NOTE(review): these assignments assume the new values line up with df's existing rows — confirm.
df['sq_dist'] = [np.linalg.norm(i) for i in df[['Income_zscore','Age_zscore']].to_numpy()]
# Switch the option back off once the columns are attached.
ks.set_option('compute.ops_on_diff_frames', False)
cmd4
#display(df)
cmd5
# Find the index label of the smallest sq_dist value (idxmin gives the label, not the value).
minindex=df['sq_dist'].idxmin()
# Bare expression so the notebook displays the result.
minindex
cmd6
# Display the sq_dist value at minindex.
# NOTE(review): idxmin() returns an index label while .iloc is position-based; if row order is
# not guaranteed to follow the index here, .loc[minindex] may be what is intended — confirm.
df['sq_dist'].iloc[minindex]
cmd7
# Convert the Koalas DataFrame to a Spark DataFrame and register it as temp view "koalastable".
df.to_spark().createOrReplaceTempView("koalastable")
cmd8
%sql
select min(sq_dist) from koalastable -- This doesn't match the value we got in cmd6
cmd9
# Repeat the same computation with pandas, starting from the Spark view of df.
df_spark = df.to_spark()
# Collect the two source columns and pack them into a 2-D NumPy array.
stats_array = np.array(df_spark.select('Age', 'Income').collect())
# axis=0 standardises each column independently.
normalized_data = stats.zscore(stats_array, axis=0)
df_pd = pd.DataFrame(normalized_data, columns=['Age', 'Income'])
# One norm per row of the standardised array.
df_pd['sq_dist'] = list(map(np.linalg.norm, normalized_data))
df_pd.head(5)
cmd10
# Index label of the smallest sq_dist value in the pandas DataFrame.
minindex_pd=df_pd['sq_dist'].idxmin()
# Bare expression so the notebook displays the result.
minindex_pd
cmd11
# Minimum of sq_dist using pandas: read the value at minindex_pd.
df_pd['sq_dist'].iloc[minindex_pd]
cmd12
# Turn the pandas DataFrame into a Spark DataFrame and register it as temp view "pandastable".
spark.createDataFrame(df_pd).createOrReplaceTempView("pandastable")
cmd13
%sql
select min(sq_dist) from pandastable -- This matches the value we got in cmd11
The text was updated successfully, but these errors were encountered: