通过小区名称调用百度地图 API 可以获取小区的地址和经纬度。但由于 API 返回的地址格式不统一,可以先用小区名称进行一轮爬取,获取小区的经纬度,然后再利用经纬度反向解析(reverse geocoding)得到小区的结构化地址。另外,如果小区名称是"……号"的形式,可以在爬取之前在"号"后面加一个"院"字,得到的结果精确度更高。这次写的程序更便于二次利用,只需要给程序传入一个 DataFrame 就可以坐等结果了。现在程序已经写好,接下来就在工作中看看效果如何。
import re
from http.client import HTTPException
from urllib import request
from urllib.parse import quote, urlencode

import pandas as pd

# Baidu Map API key used when a class is constructed without an explicit
# ``ak`` argument.  Replace with your own key.
BAIDU_AK = ''

# Seconds to wait for a single HTTP response before giving up on that row.
_REQUEST_TIMEOUT = 10

# Errors that mean "this request failed" rather than "the program is broken":
# URLError/HTTPError/socket timeouts are OSError, bad URLs and undecodable
# bodies are ValueError, truncated responses are HTTPException.
_FETCH_ERRORS = (OSError, ValueError, HTTPException)

_NUMBER = r'(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)'
_LAT_RE = re.compile(r'"lat"\s*:\s*' + _NUMBER)
_LNG_RE = re.compile(r'"lng"\s*:\s*' + _NUMBER)
_ADDRESS_RE = re.compile(r'"address"\s*:\s*"(.*?)"')
# Deliberately has no leading quote so it also matches a key that merely
# ends in ``address`` in the reverse-geocoding response.
_REVERSE_ADDRESS_RE = re.compile('address":"(.*?)"')


def _fetch_text(url):
    """Download *url* and return the decoded response body."""
    with request.urlopen(url, timeout=_REQUEST_TIMEOUT) as resp:
        return resp.read().decode()


class GetAddressInfo:
    """Look up latitude, longitude and address for named neighbourhoods.

    Each row of the DataFrame is sent to the Baidu place-search endpoint
    (``/place/v2/search``) using its ``name`` and ``city`` columns.

    Args:
        df: DataFrame with at least the columns ``'city'`` and ``'name'``.
            It is stored by reference and modified in place by
            :meth:`get_address`.
        ak: Baidu Map API key.  Falls back to the module-level ``BAIDU_AK``
            when omitted.

    Raises:
        AssertionError: if *df* is not a DataFrame or lacks a required column.
    """

    SEARCH_URL = 'http://api.map.baidu.com/place/v2/search'

    def __init__(self, df, ak=None):
        is_valid = (
            isinstance(df, pd.DataFrame)
            and 'city' in df.columns
            and 'name' in df.columns
        )
        # Raised explicitly (not via ``assert``) so the check survives
        # ``python -O`` while callers still see the same exception type.
        if not is_valid:
            raise AssertionError('The dataframe is not vailid')
        self.__data__ = df
        self.__ak__ = BAIDU_AK if ak is None else ak

    def get_address(self):
        """Fill the coordinate and address columns and return the DataFrame.

        Adds (or overwrites) the columns ``'小区经度'``, ``'小区纬度'`` and
        ``'小区地址'``.  Rows whose lookup fails get ``0``, ``0`` and
        ``'None'`` respectively.
        """
        frame = self.__data__
        found = [
            self.__get_neigbour_address__(name, city)
            for name, city in zip(frame['name'], frame['city'])
        ]
        # Whole-column assignment avoids writing strings into a float column
        # cell by cell and works even when the index has duplicate labels.
        frame['小区经度'] = [lng for _, lng, _ in found]
        frame['小区纬度'] = [lat for lat, _, _ in found]
        frame['小区地址'] = [address for _, _, address in found]
        return frame

    def __lat__(self, res):
        """Return the first ``"lat"`` value in *res*, or ``0`` if absent."""
        hit = _LAT_RE.search(res)
        return float(hit.group(1)) if hit else 0

    def __lng__(self, res):
        """Return the first ``"lng"`` value in *res*, or ``0`` if absent."""
        hit = _LNG_RE.search(res)
        return float(hit.group(1)) if hit else 0

    def __address__(self, res):
        """Return the first ``"address"`` string in *res*, or ``'None'``."""
        hit = _ADDRESS_RE.search(res)
        return hit.group(1) if hit else 'None'

    def _build_search_url(self, name, city):
        """Return the place-search URL for *name* restricted to *city*."""
        params = [
            ('query', name),
            ('tag', '住宅区'),
            ('region', city),
            ('output', 'json'),
            ('ak', self.__ak__),
        ]
        return self.SEARCH_URL + '?' + urlencode(params, quote_via=quote)

    def __get_neigbour_address__(self, name, city):
        """Query the place-search endpoint for one neighbourhood.

        Returns:
            ``(lat, lng, address)``; ``(0, 0, 'None')`` when the inputs are
            not strings (e.g. NaN cells) or the request fails.
        """
        if not isinstance(name, str) or not isinstance(city, str):
            return 0, 0, 'None'
        try:
            res = _fetch_text(self._build_search_url(name, city))
        except _FETCH_ERRORS:
            return 0, 0, 'None'
        return self.__lat__(res), self.__lng__(res), self.__address__(res)


class ReverseGetAddress:
    """Resolve coordinates back to an address via the Baidu geocoder endpoint.

    Args:
        data: DataFrame with at least the columns ``'小区纬度'``,
            ``'小区经度'`` and ``'name'``.  It is stored by reference and
            modified in place by :meth:`get_address`.
        ak: Baidu Map API key.  Falls back to the module-level ``BAIDU_AK``
            when omitted.

    Raises:
        AssertionError: if a required column is missing.
    """

    def __init__(self, data, ak=None):
        is_valid = (
            '小区纬度' in data.columns
            and '小区经度' in data.columns
            and 'name' in data.columns
        )
        # Explicit raise keeps the check active under ``python -O``.
        if not is_valid:
            raise AssertionError('The DataFrame is not vailid')
        self.__data__ = data
        self.__ak__ = BAIDU_AK if ak is None else ak

    def __get_address1__(self, url):
        """Fetch *url* and return the first address found, else ``'None1'``."""
        try:
            res = _fetch_text(url)
        except _FETCH_ERRORS:
            return 'None1'
        hit = _REVERSE_ADDRESS_RE.search(res)
        return hit.group(1) if hit else 'None1'

    def __to_string__(self, arr):
        """Return ``str(arr)``."""
        return str(arr)

    def __get_address2__(self):
        """Return a Series holding one reverse-geocoding URL per row."""
        base_url1 = 'http://api.map.baidu.com/geocoder/v2/?callback=renderReverse'
        base_url2 = '&location='
        base_url3 = '&pois=0&radius=1&output=json&pois=1&ak='
        lat_text = self.__data__['小区纬度'].apply(self.__to_string__)
        lng_text = self.__data__['小区经度'].apply(self.__to_string__)
        return (
            base_url1 + base_url2 + lat_text + ',' + lng_text
            + base_url3 + self.__ak__
        )

    def get_address(self):
        """Overwrite ``'小区地址'`` with reverse-geocoded addresses.

        Rows whose lookup fails get ``'None1'``.  Returns the DataFrame.
        """
        url = self.__get_address2__()
        self.__data__['小区地址'] = url.apply(self.__get_address1__)
        return self.__data__