import pandas as pd
# Column names shared by the observations frame and the centres frame.
REGION_COL = 'Origin Region'
LAT_COL = 'Origin Latitude'
LON_COL = 'Origin Longitude'
COORD_COLS = [LAT_COL, LON_COL]

# Rows whose latitude is strictly below this value are treated as outliers.
LATITUDE_THRESHOLD = -36.15

# NOTE: the raw data used to be bound to the name `dict`, which shadowed the
# builtin for the rest of the module; it now has its own name.
origin_data = {
    'Origin Region': [1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 5.0],
    'Origin Latitude': [-36.45875, -36.24879, -36.789456, -38.14789,
                        -36.15963, -36.159455, -36.2345, -36.12745],
    'Origin Longitude': [145.14563, 145.15987, 145.87456, 146.75314,
                         145.75483, 145.78458, 145.123654, 145.11111],
}
df = pd.DataFrame(origin_data)

centres_dict = {
    'Origin Region': [1.0, 2.0, 3.0, 4.0, 5.0],
    'Origin Latitude': [-36.25361, -36.78541, -36.74859, -38.74123, -36.14538],
    'Origin Longitude': [145.12345, 145.36241, 145.12365, 146.75314, 145.75483],
}
centres_df = pd.DataFrame(centres_dict)


def find_outliers(region_group, threshold=LATITUDE_THRESHOLD):
    """Return the coordinates of rows whose latitude is below *threshold*.

    Args:
        region_group: DataFrame containing at least the latitude and
            longitude columns.
        threshold: Latitude cut-off; rows strictly below it are kept.

    Returns:
        DataFrame with only the latitude/longitude columns, keeping the
        original index labels of the selected rows. Rows with a missing
        coordinate are dropped.
    """
    below_threshold = region_group[LAT_COL] < threshold
    return region_group.loc[below_threshold, COORD_COLS].dropna()


def nearest_centres(outliers, centres):
    """Find, for every outlier, the closest centre and the distance to it.

    Distances are plain Euclidean distances on the (latitude, longitude)
    pairs, i.e. expressed in degrees.
    NOTE(review): the metric is an assumption - switch to a haversine
    formula if real-world ground distances are required.

    Args:
        outliers: DataFrame with latitude and longitude columns.
        centres: DataFrame with region, latitude and longitude columns.

    Returns:
        DataFrame indexed like *outliers* with the columns
        'Nearest Region' and 'Distance'. Empty when either input is empty.
    """
    result_columns = ['Nearest Region', 'Distance']
    if outliers.empty or centres.empty:
        # argmin over zero centres would raise, so bail out early.
        return pd.DataFrame(columns=result_columns, index=outliers.index[:0])

    points = outliers[COORD_COLS].to_numpy(dtype=float)
    targets = centres[COORD_COLS].to_numpy(dtype=float)

    # Broadcast to shape (n_outliers, n_centres, 2) so that every pairwise
    # difference is computed at once instead of with nested iterrows loops.
    deltas = points[:, None, :] - targets[None, :, :]
    distances = (deltas ** 2).sum(axis=2) ** 0.5

    closest = distances.argmin(axis=1)
    return pd.DataFrame(
        {
            'Nearest Region': centres[REGION_COL].to_numpy()[closest],
            'Distance': distances.min(axis=1),
        },
        index=outliers.index,
    )


grouped_region = df.groupby(REGION_COL)

# Maps each region that has outliers to its nearest-centre table.
nearest_by_region = {}
for region, region_group in grouped_region:
    outliers = find_outliers(region_group)
    print(outliers)
    # `~outliers.empty` was always truthy (~True == -2, ~False == -1);
    # use a plain boolean test instead.
    if outliers.empty:
        continue
    nearest = nearest_centres(outliers, centres_df)
    nearest_by_region[region] = nearest
    print(nearest)
I am trying to loop through each group of a dataframe (df), filter the values in each group based on some condition, and then compute the distance between each of these filtered values (outliers) and all the values in another dataframe (centres_df).
I have the data in dataframes. Should I convert them into arrays and then use scipy's cdist to calculate the distances, or simply use a loop with my own distance calculation function? Or maybe use apply and call my own distance function? I am not sure what the best way to do this is.