Get the nearest distance between two GeoDataFrames in pandas

I think it's quite difficult to find a solution with a time complexity better than O(m·n), where m and n are the sizes of city1 and city2. Keeping the distance comparison (the only O(m·n) operation) simple, and taking advantage of the vectorized operations provided by numpy and pandas, speed should not be a problem for any reasonable input size.

The idea is that, to compare distances on a sphere, you can compare the straight-line distances between the points in 3D. The city that is closest along the surface is also the closest along the straight line (chord) passing through the sphere, because the chord length grows monotonically with the great-circle distance. Furthermore, you normally take square roots to calculate distances, but if you only need to compare them, you can avoid the square roots.

from geopy.distance import distance as dist
import numpy as np
import pandas as pd

def find_closest(lat1, lng1, lat2, lng2):
    """Find, for every point of the first set, the nearest point of the second set.

    Points are compared through their squared straight-line (chord) distance
    on the unit sphere, so no square root or trigonometric inverse is needed.

    Args:
        lat1, lng1: latitudes / longitudes (degrees) of the query points.
        lat2, lng2: latitudes / longitudes (degrees) of the candidate points.
            These must be pandas Series, because the index label of the
            winner is obtained with ``Series.idxmin()``.

    Returns:
        A ``pd.Series`` with one entry per query point, holding the index
        label (taken from ``lat2`` / ``lng2``) of the closest candidate.
    """
    def to_unit_xyz(lat_deg, lng_deg):
        # Degrees -> radians, then spherical -> Cartesian on the unit sphere.
        lat_r = np.radians(lat_deg)
        lng_r = np.radians(lng_deg)
        horizontal = np.cos(lat_r)
        return horizontal * np.cos(lng_r), horizontal * np.sin(lng_r), np.sin(lat_r)

    ax, ay, az = to_unit_xyz(lat1, lng1)
    bx, by, bz = to_unit_xyz(lat2, lng2)

    # One vectorized pass over all candidates per query point.
    nearest_labels = [
        ((bx - px)**2 + (by - py)**2 + (bz - pz)**2).idxmin()
        for px, py, pz in zip(ax, ay, az)
    ]
    return pd.Series(nearest_labels)

# Sample input: three query cities and two candidate cities.
city1 = [{"City":"Tokyo",    "Ctry":"JP", "Latitude": 35.68972, "Longitude": 139.69222},
         {"City":"Pretoria", "Ctry":"ZA", "Latitude":-25.71667, "Longitude": 28.28333},
         {"City":"London",   "Ctry":"GB", "Latitude": 51.50722, "Longitude": -0.12574}]
city2 = [{"City":"Seattle",  "Ctry":"US", "Latitude": 47.60972, "Longitude":-122.33306},
         {"City":"Auckland", "Ctry":"NZ", "Latitude":-36.84446, "Longitude": 174.76364}]
city1df, city2df = pd.DataFrame(city1), pd.DataFrame(city2)

# Index label (into city2df) of the candidate nearest to each city1df row.
closest = find_closest(city1df['Latitude'], city1df['Longitude'],
                       city2df['Latitude'], city2df['Longitude'])

# Attach the matching city2df row to every city1df row; the joined columns
# get the suffix '2' (City2, Latitude2, ...).
resultdf = city1df.join(city2df, on=closest, rsuffix='2')

# Geodesic distance (rounded to whole km) between each matched pair.
origins = resultdf[['Latitude',  'Longitude' ]].to_numpy()
targets = resultdf[['Latitude2', 'Longitude2']].to_numpy()
km = pd.Series([round(dist(origin, target).km)
                for origin, target in zip(origins, targets)])
resultdf['Distance'] = km
print(resultdf.to_string())
#        City Ctry  Latitude  Longitude     City2 Ctry2  Latitude2  Longitude2  Distance
# 0     Tokyo   JP  35.68972  139.69222   Seattle    US   47.60972  -122.33306      7715
# 1  Pretoria   ZA -25.71667   28.28333  Auckland    NZ  -36.84446   174.76364     12245
# 2    London   GB  51.50722   -0.12574   Seattle    US   47.60972  -122.33306      7723

Note that any solution that uses latitude and longitude as if they were Cartesian coordinates is wrong, because moving toward the poles the meridians (lines of equal longitude) get closer to each other.


This solution probably isn't the quickest way to solve your problem, but I believe it will do the trick.

# The new dataframe is basically a copy of the first one, with two extra
# columns that start out empty.
gcity3df = gcity1df.copy()
gcity3df["Nearest"] = None
gcity3df["Distance"] = None

# For each city (row in gcity3df) we calculate the nearest city from gcity2df
# and fill the Nones with the results.
for index, row in gcity3df.iterrows():
    # Best candidate found so far for this row; both stay None until the
    # first gcity2df row has been examined (and for good if gcity2df is empty).
    nearest = None
    distance = None
    for _, df2row in gcity2df.iterrows():
        # Distance is expressed in the units of the geometry coordinates
        # (degrees when the points are raw longitude/latitude).
        d = row.geometry.distance(df2row.geometry)
        # If this city is closer than the previous ones, it becomes the
        # new nearest city. Strict '<' keeps the first one on ties.
        if distance is None or d < distance:
            distance = d
            nearest = df2row.City
    # Finally, store the closest city and its distance for this row.
    gcity3df.at[index, "Nearest"] = nearest
    gcity3df.at[index, "Distance"] = distance

If you need to work in meters rather than degrees, you can always reproject your layer (this will also eliminate the mistake that Walter is mentioning). You can do it with gcity3df = gcity3df.to_crs({'init': 'epsg:XXXX'}) (or gcity3df = gcity3df.to_crs(epsg=XXXX) in recent geopandas versions), where XXXX is the EPSG code of the CRS used in your region of the world.


First, I merge the two data frames with a cross join. Then, I compute the distance between each pair of points using map in Python. I use map because most of the time it is much faster than apply, itertuples, iterrows, etc. (Reference: https://stackoverflow.com/a/52674448/8205554)

Lastly, I group the merged data frame by city and fetch the row with the minimum distance in each group.

Here are libraries,

import pandas as pd
import geopandas
import geopy.distance
from math import radians, cos, sin, asin, sqrt

Here are used functions,

def _haversine_km(lon1, lat1, lon2, lat2, radius_km=6373):
    """Great-circle distance between two lon/lat positions (haversine formula).

    Args:
        lon1, lat1: longitude and latitude of the first position, in degrees.
        lon2, lat2: longitude and latitude of the second position, in degrees.
        radius_km: sphere radius; the result is expressed in the same unit.

    Returns:
        The distance along the sphere as a float.
    """
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))

    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make asin() raise
    # "math domain error"; clamp it to the valid range.
    c = 2 * asin(sqrt(min(1.0, a)))

    return c * radius_km


def dist1(p1, p2, *, radius_km=6373):
    """Haversine distance between two point objects.

    Args:
        p1, p2: objects exposing ``.x`` (longitude) and ``.y`` (latitude),
            both in degrees.
        radius_km: sphere radius in km. The default, 6373, is the value this
            code has always used.

    Returns:
        The great-circle distance in km.
    """
    return _haversine_km(p1.x, p1.y, p2.x, p2.y, radius_km)


def dist2(p1, p2, *, radius_km=6373):
    """Haversine distance between two ``(longitude, latitude)`` sequences.

    Args:
        p1, p2: indexable pairs whose item 0 is the longitude and item 1 the
            latitude, both in degrees.
        radius_km: sphere radius in km. The default, 6373, is the value this
            code has always used.

    Returns:
        The great-circle distance in km.
    """
    return _haversine_km(p1[0], p1[1], p2[0], p2[1], radius_km)

def dist3(p1, p2):
    """Return the geodesic distance in km between two point objects.

    Each point exposes ``.x`` and ``.y``; the pair is handed to geopy as
    ``(y, x)``, i.e. with the two coordinates swapped.
    """
    return geopy.distance.geodesic((p1.y, p1.x), (p2.y, p2.x)).km

def dist4(p1, p2):
    """Return the geodesic distance in km between two indexable pairs.

    Items 0 and 1 of each pair are swapped before being handed to geopy,
    so geopy receives ``(p[1], p[0])``.
    """
    first_swapped = (p1[1], p1[0])
    second_swapped = (p2[1], p2[0])
    return geopy.distance.geodesic(first_swapped, second_swapped).km

And data,

# Query cities: one dict per city, coordinates in decimal degrees.
# NOTE(review): Brasilia's longitude is identical to Santiago's (-70.66);
# this looks like a copy-paste slip — confirm before relying on the values.
city1 = [
    {'City': 'Buenos Aires', 'Country': 'Argentina', 'Latitude': -34.58, 'Longitude': -58.66},
    {'City': 'Brasilia', 'Country': 'Brazil', 'Latitude': -15.78, 'Longitude': -70.66},
    {'City': 'Santiago', 'Country': 'Chile ', 'Latitude': -33.45, 'Longitude': -70.66},
]

# Candidate cities among which the nearest one is searched.
city2 = [
    {'City': 'Bogota', 'Country': 'Colombia ', 'Latitude': 4.6, 'Longitude': -74.08},
    {'City': 'Caracas', 'Country': 'Venezuela', 'Latitude': 10.48, 'Longitude': -66.86},
]

# One DataFrame row per city, columns taken from the dict keys.
city1df, city2df = pd.DataFrame(city1), pd.DataFrame(city2)

Cross join with geopandas data frames,

# Wrap city1df in a GeoDataFrame whose geometry column holds one point per
# row, built with x = Longitude and y = Latitude.
gcity1df = geopandas.GeoDataFrame(
    city1df, 
    geometry=geopandas.points_from_xy(city1df.Longitude, city1df.Latitude)
)
# Same construction for the candidate cities.
gcity2df = geopandas.GeoDataFrame(
    city2df, 
    geometry=geopandas.points_from_xy(city2df.Longitude, city2df.Latitude)
)

# cross join geopandas
# Both frames get the same constant 'key', so merging on it pairs every
# gcity1df row with every gcity2df row. The cells further down read the
# result through the suffixed names (City_x, geometry_y, ...).
gcity1df['key'] = 1
gcity2df['key'] = 1
merged = gcity1df.merge(gcity2df, on='key')

math functions and geopandas,

# 6.64 ms ± 588 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit

# find distance
# dist1 is applied pairwise to the geometry_x / geometry_y columns of the
# cross-joined frame, giving one distance per pair of cities.
merged['dist'] = list(map(dist1, merged['geometry_x'], merged['geometry_y']))

# Merged column name -> output column name. The dict also determines which
# columns are kept, and in which order, in the final selection below.
mapping = {
    'City_x': 'City',
    'Country_x': 'Country',
    'Latitude_x': 'Latitude',
    'Longitude_x': 'Longitude',
    'geometry_x': 'geometry',
    'City_y': 'Nearest',
    'dist': 'Distance'
}

# For every (City_x, Country_x) group, idxmin() yields the row label with the
# smallest 'dist'; .loc then pulls exactly those rows out of the cross join.
nearest = merged.loc[merged.groupby(['City_x', 'Country_x'])['dist'].idxmin()]
# Rename to the output names and keep only the mapped columns.
nearest.rename(columns=mapping)[list(mapping.values())]

           City    Country  Latitude  Longitude                     geometry  \
2      Brasilia     Brazil    -15.78     -70.66  POINT (-70.66000 -15.78000)   
0  Buenos Aires  Argentina    -34.58     -58.66  POINT (-58.66000 -34.58000)   
4      Santiago     Chile     -33.45     -70.66  POINT (-70.66000 -33.45000)   

  Nearest     Distance  
2  Bogota  2297.922808  
0  Bogota  4648.004515  
4  Bogota  4247.586882 

geopy and geopandas,

# 9.99 ms ± 764 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit

# find distance
# dist3 (the geopy-based variant) is applied pairwise to the geometry_x /
# geometry_y columns of the cross-joined frame.
merged['dist'] = list(map(dist3, merged['geometry_x'], merged['geometry_y']))

# Merged column name -> output column name. The dict also determines which
# columns are kept, and in which order, in the final selection below.
mapping = {
    'City_x': 'City',
    'Country_x': 'Country',
    'Latitude_x': 'Latitude',
    'Longitude_x': 'Longitude',
    'geometry_x': 'geometry',
    'City_y': 'Nearest',
    'dist': 'Distance'
}

# For every (City_x, Country_x) group, idxmin() yields the row label with the
# smallest 'dist'; .loc then pulls exactly those rows out of the cross join.
nearest = merged.loc[merged.groupby(['City_x', 'Country_x'])['dist'].idxmin()]
# Rename to the output names and keep only the mapped columns.
nearest.rename(columns=mapping)[list(mapping.values())]

           City    Country  Latitude  Longitude                     geometry  \
2      Brasilia     Brazil    -15.78     -70.66  POINT (-70.66000 -15.78000)   
0  Buenos Aires  Argentina    -34.58     -58.66  POINT (-58.66000 -34.58000)   
4      Santiago     Chile     -33.45     -70.66  POINT (-70.66000 -33.45000)   

  Nearest     Distance  
2  Bogota  2285.239605  
0  Bogota  4628.641817  
4  Bogota  4226.710978 

If you want to use pandas instead of geopandas,

# cross join pandas
# Give both frames the same constant key, so that merging on it pairs every
# city1df row with every city2df row.
for frame in (city1df, city2df):
    frame['key'] = 1
merged = pd.merge(city1df, city2df, on='key')

With math functions,

# 8.65 ms ± 2.21 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit

# find distance
# dist2 receives one (Longitude, Latitude) row from each side of the cross
# join; .values turns the two-column selections into arrays that map()
# iterates row by row.
merged['dist'] = list(
    map(
        dist2, 
        merged[['Longitude_x', 'Latitude_x']].values, 
        merged[['Longitude_y', 'Latitude_y']].values
    )
)

# Merged column name -> output column name. The dict also determines which
# columns are kept, and in which order, in the final selection below.
mapping = {
    'City_x': 'City',
    'Country_x': 'Country',
    'Latitude_x': 'Latitude',
    'Longitude_x': 'Longitude',
    'City_y': 'Nearest',
    'dist': 'Distance'
}

# For every (City_x, Country_x) group, idxmin() yields the row label with the
# smallest 'dist'; .loc then pulls exactly those rows out of the cross join.
nearest = merged.loc[merged.groupby(['City_x', 'Country_x'])['dist'].idxmin()]
# Rename to the output names and keep only the mapped columns.
nearest.rename(columns=mapping)[list(mapping.values())]

           City    Country  Latitude  Longitude Nearest     Distance
2      Brasilia     Brazil    -15.78     -70.66  Bogota  2297.922808
0  Buenos Aires  Argentina    -34.58     -58.66  Bogota  4648.004515
4      Santiago     Chile     -33.45     -70.66  Bogota  4247.586882

With geopy,

# 9.8 ms ± 807 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit

# find distance
# dist4 (the geopy-based variant) receives one (Longitude, Latitude) row from
# each side of the cross join; .values turns the two-column selections into
# arrays that map() iterates row by row.
merged['dist'] = list(
    map(
        dist4, 
        merged[['Longitude_x', 'Latitude_x']].values, 
        merged[['Longitude_y', 'Latitude_y']].values
    )
)

# Merged column name -> output column name. The dict also determines which
# columns are kept, and in which order, in the final selection below.
mapping = {
    'City_x': 'City',
    'Country_x': 'Country',
    'Latitude_x': 'Latitude',
    'Longitude_x': 'Longitude',
    'City_y': 'Nearest',
    'dist': 'Distance'
}

# For every (City_x, Country_x) group, idxmin() yields the row label with the
# smallest 'dist'; .loc then pulls exactly those rows out of the cross join.
nearest = merged.loc[merged.groupby(['City_x', 'Country_x'])['dist'].idxmin()]
# Rename to the output names and keep only the mapped columns.
nearest.rename(columns=mapping)[list(mapping.values())]

           City    Country  Latitude  Longitude Nearest     Distance
2      Brasilia     Brazil    -15.78     -70.66  Bogota  2285.239605
0  Buenos Aires  Argentina    -34.58     -58.66  Bogota  4628.641817
4      Santiago     Chile     -33.45     -70.66  Bogota  4226.710978