-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgeocode.py
104 lines (84 loc) · 2.95 KB
/
geocode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import time
import requests
import concurrent.futures
import pandas as pd
import math
# Shared HTTP session: reuses keep-alive connections across the many
# geocoding requests issued by the worker threads.
http = requests.Session()
# Photon geocoder endpoints (host:port). The query workload in main() is
# split evenly across these servers, one chunk per entry.
servers = ['localhost:2322']
# manages the request to the localhost/server running photon.jar
def geoRequest(loc, server):
    """Query the photon geocoder at *server* for *loc*.

    Sends GET http://<server>/api?q=<loc>&limit=30, returns the decoded
    JSON response with an extra "location" key echoing the query, and
    prints the elapsed wall-clock time.

    Raises whatever requests raises on connection failure/timeout, and
    ValueError if the response body is not JSON.
    """
    start = time.perf_counter()
    params = {'q': loc, 'limit': 30}
    url = f'http://{server}/api'
    # timeout so a stalled/unreachable server cannot hang a worker thread forever
    r = http.get(url, params=params,
                 headers={'user-agent': 'geocode-tester'},
                 timeout=30)
    r = r.json()
    # echo the query back so the consumer can pair results with inputs
    r.update({"location": loc})
    finish = time.perf_counter()
    print(f'geoRequest for {loc} \nfinished in {round(finish-start, 2)} second(s)')
    return r
def geocode(loc, server):
    """Geocode every query row in *loc* against *server* and write addresses.csv.

    *loc* is an iterable of single-element rows (each row's first element is
    the query string, as produced by main()). Requests are fanned out over a
    thread pool via geoRequest; every returned GeoJSON feature becomes one
    row of the output CSV.

    NOTE(review): all geocode() calls write the same 'addresses.csv' path —
    with more than one server the last writer wins; confirm this is intended.
    """
    columns = [
        'location',
        'osm_id',
        'osm_type',
        'name',
        'housenumber',
        'street',
        'postcode',
        'city',
        'state',
        'country',
        'osm_key',
        'osm_value',
        'lon',
        'lat',
        'message'
    ]
    # collect one small frame per feature and concat ONCE at the end —
    # pd.concat inside the loop is quadratic in the number of rows
    frames = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # fan the queries out; one geoRequest per query row
        results = executor.map(geoRequest, loc, [server for _ in loc])
        for result in results:
            for address in result['features']:
                g = address['geometry']['coordinates']
                p = address['properties']
                # GeoJSON coordinates are [lon, lat]
                g = {'lon': g[0], 'lat': g[1]}
                # drop columns we don't want in the CSV
                p.pop('extent', None)
                p.pop('type', None)
                p.pop('district', None)
                p.pop('county', None)
                p.pop('countrycode', None)
                p.pop('locality', None)
                # merge properties + coordinates + the originating query string
                values = {**p, **g, 'location': result['location'][0]}
                frames.append(pd.DataFrame(data=values, index=[0]))
    # seed with an empty, fully-typed frame so the CSV keeps the declared
    # column order (and a header) even when no features came back
    addressDF = pd.concat([pd.DataFrame(columns=columns)] + frames)
    addressDF.to_csv('addresses.csv', index=False,
                     header=True, encoding='utf-8')
# spliting array into chunks:
# https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(lst, n):
    """Split *lst* into consecutive slices of length *n*.

    The final slice is shorter when len(lst) is not a multiple of *n*.
    Works on any sliceable sequence (list, numpy array, ...).
    """
    pieces = []
    for offset in range(0, len(lst), n):
        pieces.append(lst[offset:offset + n])
    return pieces
def main():
    """Read the query file, split it across servers, and geocode in parallel.

    One geocode() task is submitted per server, each receiving an
    evenly-sized chunk of the query rows.
    """
    # NOTE(review): original comment said "searches.csv" but the path here
    # is "searches" — confirm the actual filename on disk before changing.
    df = pd.read_csv("searches")
    allLoc = df.loc[:, ['query']].to_numpy()
    # even split: ceil so the last chunk absorbs the remainder
    eachLocArrSize = math.ceil(len(allLoc) / len(servers))
    locSplit = chunks(allLoc, eachLocArrSize)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = executor.map(geocode, locSplit, servers)
        # drain the iterator: executor.map re-raises worker exceptions here,
        # so failures surface instead of being silently dropped
        for _ in results:
            pass
if __name__ == '__main__':
    # time the whole run end-to-end
    start = time.perf_counter()
    main()
    finish = time.perf_counter()
    print(f'finished in {round(finish-start, 2)} second(s)')