I have a pandas dataframe with the following relevant features: location (city), lat, long.
I want to add to this pandas dataframe the following columns in the most efficient manner:
- Distance to closest hospital
- Distance to closest train station
- Etc
My dataframe looks like this:
Column1 postal_code city_name type_of_property price number_of_rooms house_area fully_equipped_kitchen open_fire terrace garden surface_of_the_land number_of_facades swimming_pool state_of_the_building lattitude longitude province region
13380 13380 1785 Brussegem 1 235000 1 71 1 0 1 0 0 4 0 good 4.265887 50.927771 Brabant flamand Flandre
21135 21135 8630 Bulskamp 1 545000 3 191 1 0 0 0 0 2 0 as new 2.643449 51.044461 Flandre-Occidentale Flandre
5002 5002 4287 Lincent 0 97500 3 90 0 0 0 0 260 4 0 to renovate 5.031817 50.711613 Liège Wallonie
9544 9544 8400 Oostende 0 130000 3 119 1 0 0 1 71 2 0 to renovate 2.920327 51.230318 Flandre-Occidentale Flandre
38923 38923 4830 Limbourg 0 149000 2 140 1 0 0 0 15 2 0 to be done up 5.940299 50.612276 Liège Walloni
I found this python package which allows me to find places near by:
https://github.com/slimkrazy/python-google-places
So I created 2 functions:
To calculate distance between 2 points (geodesic distance)
import geopy
from googleplaces import GooglePlaces, types, lang, ranking
import geopy.distance


def geodesicdistance(start, end):
    """Return the geodesic distance between two points in kilometres.

    start, end - points in a form accepted by geopy.distance.geodesic,
                 e.g. (latitude, longitude) tuples

    The value is returned rather than printed: ``return print(...)`` would
    hand the caller None, because print() has no return value.
    """
    return geopy.distance.geodesic(start, end).km
Function to get nearby places to a start point (hospitals, train stations, etc) It all depends on the type parameter
def getplace(start, location, type):
    """Return the coordinates of the first nearby place of the given type.

    start    - passed to nearby_search as lat_lng
    location - passed to nearby_search as location
    type     - single place type to search for

    The search is ranked by distance, and the first entry of the result is
    used. The coordinates are returned as a (lng, lat) tuple, in that order.
    Raises IndexError if the search yields no places.
    """
    YOUR_API_KEY = ''
    client = GooglePlaces(YOUR_API_KEY)
    response = client.nearby_search(
        lat_lng=start,
        location=location,
        radius=500,
        types=[type],
        rankby=ranking.DISTANCE,
    )
    first_lng = response.places[0].geo_location['lng']
    first_lat = response.places[0].geo_location['lat']
    return (first_lng, first_lat)
the type is an enum with the following relevant values for me:
TYPE_BUS_STATION, TYPE_HOSPITAL, TYPE_AIRPORT, TYPE_BAKERY,TYPE_CITY_HALL, TYPE_FIRE_STATION, TYPE_PHARMACY, TYPE_POLICE,TYPE_SUBWAY_STATION,TYPE_TRAIN_STATION,TYPE_UNIVERSITY
11 types.
Basically I want to create 11 new dynamic columns: Distance to closest bus station, Distance to closest hospital, Distance to closest airport, etc.
The part I am missing, and don't have a clue how to do, is how to iterate over the pandas dataframe and, for each row, call the function 11 times and store each result as a new value in the corresponding column.
Update: I tried the code below but still got an error. (I replaced location with city_name.)
import geopy
from googleplaces import GooglePlaces, types, lang, ranking
import geopy.distance
# Corrected geodesicdistance so it returns correct result rather than None
def geodesicdistance(start, end):
    """Return the geodesic distance from start to end, in kilometres."""
    return geopy.distance.geodesic(start, end).km
# Refactored getplace to getplaces to provide distance to items in type_list
def getplaces(start,
              location,
              types):
    '''
    Get the distance from start to the closest place of each type.

    start    - dict with 'lat' and 'lng' keys; used as the search origin
    location - human-readable location, passed through to nearby_search
    types    - iterable of place types; one search is issued per type
               (note: this name shadows the imported googleplaces `types`
               module inside the function)

    Returns a list of distances in km (as computed by geodesicdistance),
    one per entry of types and in the same order.

    Raises IndexError when a search returns no places, because the first
    result is indexed unconditionally.
    '''
    # Placeholder key; it is not passed on - the client below is built
    # with a literal empty string.
    YOUR_API_KEY = ''
    google_places = GooglePlaces('')
    # Compute the closest distance for each type
    start_lat_lng = (start['lat'], start['lng'])
    distances = []
    for type_ in types:
        # Find closest place for type_
        query_result = google_places.nearby_search(
            lat_lng = start, # A dict containing the following keys: lat, lng (default None)
            location = location, # A human readable location, e.g 'London, England' (default None)
            radius = 500,
            types = [type_],
            rankby = ranking.DISTANCE
        )
        # 0th element has lat/lng for closest since we ranked by distance.
        # An empty result list makes places[0] raise IndexError here.
        end_lat_lng = query_result.places[0].geo_location['lat'], query_result.places[0].geo_location['lng']
        # Add distance to closest to list by type
        distances.append(geodesicdistance(start_lat_lng , end_lat_lng ))
    return distances
def min_dist(lattitude, longitude, location, types):
    '''
    Return the distances from a position to the closest place of each type.

    lattitude/longitude - coordinates of the position to search around
    location - human-readable name of the position
    types - list of place types to look up

    The coordinates are packed into a {'lat': ..., 'lng': ...} dict and the
    work is delegated to getplaces, whose result is returned unchanged.
    '''
    origin = dict(lat=lattitude, lng=longitude)
    return getplaces(origin, location, types)
# List of 11 types used from googleplaces.types; these values also become
# the column labels of closest_df below.
types_list = [types.TYPE_BUS_STATION, types.TYPE_HOSPITAL, types.TYPE_AIRPORT, types.TYPE_BAKERY,
              types.TYPE_CITY_HALL, types.TYPE_FIRE_STATION, types.TYPE_PHARMACY,
              types.TYPE_POLICE,types.TYPE_SUBWAY_STATION, types.TYPE_TRAIN_STATION, types.TYPE_UNIVERSITY]
# Create Dataframe whose columns have the distances to closest item by type.
# min_dist is called once per row and its list result is wrapped in a Series
# indexed by types_list, so each type becomes one column.
# NOTE(review): in the sample rows the 'lattitude' column holds values around
# 2-6 and 'longitude' holds values around 50-51, which for Belgium looks like
# the two are swapped - confirm; if so, the search origin used here is wrong.
closest_df = df.apply(lambda row: pd.Series(min_dist(row['lattitude'],
                                                     row['longitude'],
                                                     row['city_name'],
                                                     types_list),
                                            index = types_list),
                      axis = 'columns',
                      result_type ='expand')
# Concatenate the two dfs columnwise
final_df = pd.concat([df, closest_df], axis = 1)
but I have this error:
IndexError Traceback (most recent call last)
/home/azureuser/cloudfiles/code/Users/levm38/LightGBM/Tests/featureengineering.ipynb Cell 2 in <cell line: 63>()
58 types_list = [types.TYPE_BUS_STATION, types.TYPE_HOSPITAL, types.TYPE_AIRPORT, types.TYPE_BAKERY,
59 types.TYPE_CITY_HALL, types.TYPE_FIRE_STATION, types.TYPE_PHARMACY,
60 types.TYPE_POLICE,types.TYPE_SUBWAY_STATION, types.TYPE_TRAIN_STATION, types.TYPE_UNIVERSITY]
62 # Create Dataframe whose columns have the distances to closest item by type
---> 63 closest_df = df.apply(lambda row: pd.Series(min_dist(row['lattitude'],
64 row['longitude'],
65 row['city_name'],
66 types_list),
67 index = types_list),
68 axis = 'columns',
69 result_type ='expand')
71 # Concatenate the two dfs columnwise
72 final_df = pd.concat([df, closest_df], axis = 1)
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/pandas/core/frame.py:8845, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
8834 from pandas.core.apply import frame_apply
8836 op = frame_apply(
8837 self,
8838 func=func,
(...)
8843 kwargs=kwargs,
8844 )
...
---> 36 end_lat_lng = query_result.places[0].geo_location['lat'], query_result.places[0].geo_location['lng']
38 # Add distance to closest to list by type
39 distances.append(geodesicdistance(start_lat_lng , end_lat_lng ))
IndexError: list index out of range
Update 2:
The code runs but I always get NaNs in all columns, even if I increase the radius to 5000 m instead of 500.
The only thing I did was replace location with city_name; I hardcoded + ",Belgium" onto the city name in the location parameter.
See dataset here: https://raw.githubusercontent.com/Joffreybvn/real-estate-data-analysis/master/data/clean/belgium_real_estate.csv
Updated Code:
import geopy
from googleplaces import GooglePlaces, types, lang, ranking
import geopy.distance
import numpy as np
import pandas as pd
# Corrected geodesicdistance so it returns correct result rather than None
def geodesicdistance(start, end):
    """Return the geodesic distance from start to end, in kilometres."""
    return geopy.distance.geodesic(start, end).km
# Refactored getplace to getplaces to provide distance to items in type_list
def getplaces(start,
              location,
              types):
    '''
    Get the distance from start to the closest place of each type.

    start    - dict with 'lat' and 'lng' keys; used as the search origin
    location - human-readable location; ",Belgium" is appended before it
               is passed to nearby_search
    types    - iterable of place types; one search is issued per type

    Returns a list with one entry per type, in the same order: the distance
    in km to the first search result, or NaN when an IndexError occurs
    (e.g. the search returned no places).
    '''
    YOUR_API_KEY = ''
    client = GooglePlaces(YOUR_API_KEY)
    origin = (start['lat'], start['lng'])

    results = []
    for place_type in types:
        # NOTE(review): the python-google-places docs describe
        # ranking.DISTANCE as implying no radius argument, so the radius
        # below may have no effect - confirm against the library.
        response = client.nearby_search(
            lat_lng=start,
            location=location + ",Belgium",
            radius=5000,
            types=[place_type],
            rankby=ranking.DISTANCE,
        )
        try:
            # Results are ranked by distance, so the first one is the closest
            closest = (response.places[0].geo_location['lat'],
                       response.places[0].geo_location['lng'])
            km = geodesicdistance(origin, closest)
        except IndexError:
            # No usable first result for this type
            km = np.nan
        results.append(km)
    return results
def min_dist(latitude, longitude, location, types):
    '''
    Return the distances from a position to the closest place of each type.

    latitude/longitude - coordinates of the position to search around
    location - human-readable name of the position
    types - list of place types to look up

    The coordinates are packed into a {'lat': ..., 'lng': ...} dict and the
    work is delegated to getplaces, whose result is returned unchanged.
    '''
    origin = dict(lat=latitude, lng=longitude)
    return getplaces(origin, location, types)
# Work on a random sample of 10 rows; every row triggers one search per type.
dfsmall = df.sample(10)

# Place types to look up; these values also become the new column labels.
types_list = [types.TYPE_BUS_STATION, types.TYPE_HOSPITAL, types.TYPE_AIRPORT, types.TYPE_BAKERY,
              types.TYPE_CITY_HALL, types.TYPE_FIRE_STATION, types.TYPE_PHARMACY,
              types.TYPE_POLICE, types.TYPE_SUBWAY_STATION, types.TYPE_TRAIN_STATION, types.TYPE_UNIVERSITY]

# Create Dataframe whose columns have the distances to closest item by type.
#
# The dataset's coordinate columns are mislabelled: 'lattitude' holds values
# around 2-6 (Belgian longitudes) and 'longitude' holds values around 50-51
# (Belgian latitudes). Passing them in column-name order puts the search
# origin far away from Belgium, so every search comes back empty and all
# distances become NaN. The two columns are therefore swapped here.
closest_df = dfsmall.apply(
    lambda row: pd.Series(
        min_dist(row['longitude'],   # actual latitude
                 row['lattitude'],   # actual longitude
                 row['city_name'],
                 types_list),
        index=types_list),
    axis='columns',
    result_type='expand')

# Concatenate the two dfs columnwise
final_df = pd.concat([dfsmall, closest_df], axis=1)
print('Final df')
display(final_df)