#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Load AMap (Gaode) administrative areas and kindergarten POIs into MySQL.

Pipeline (run the steps manually, in order):
  1. saveProvince()  - seed gd_area with the 34 province-level divisions
  2. parseHtml()     - parse a saved AMap city page into gd_area level 2
  3. readJsonFile()  - load the full district tree into gd_districts
  4. overWork()      - crawl kindergarten POIs per city into gd_kids

NOTE(review): original file targeted Python 2 (``from urllib import
urlopen``); imports are modernised to Python 3.  DB/network side effects
are unchanged.
"""
import json
import math
import re
from urllib.request import urlopen  # kept for parity with the original file (currently unused)

import pymysql
import requests
from bs4 import BeautifulSoup

# Province-level administrative divisions of China: AMap adcode -> name.
provinceList = [
    {"adcode": "110000", "name": "北京市"},
    {"adcode": "120000", "name": "天津市"},
    {"adcode": "130000", "name": "河北省"},
    {"adcode": "140000", "name": "山西省"},
    {"adcode": "150000", "name": "内蒙古自治区"},
    {"adcode": "210000", "name": "辽宁省"},
    {"adcode": "220000", "name": "吉林省"},
    {"adcode": "230000", "name": "黑龙江省"},
    {"adcode": "310000", "name": "上海市"},
    {"adcode": "320000", "name": "江苏省"},
    {"adcode": "330000", "name": "浙江省"},
    {"adcode": "340000", "name": "安徽省"},
    {"adcode": "350000", "name": "福建省"},
    {"adcode": "360000", "name": "江西省"},
    {"adcode": "370000", "name": "山东省"},
    {"adcode": "410000", "name": "河南省"},
    {"adcode": "420000", "name": "湖北省"},
    {"adcode": "430000", "name": "湖南省"},
    {"adcode": "440000", "name": "广东省"},
    {"adcode": "450000", "name": "广西壮族自治区"},
    {"adcode": "460000", "name": "海南省"},
    {"adcode": "500000", "name": "重庆市"},
    {"adcode": "510000", "name": "四川省"},
    {"adcode": "520000", "name": "贵州省"},
    {"adcode": "530000", "name": "云南省"},
    {"adcode": "540000", "name": "西藏自治区"},
    {"adcode": "610000", "name": "陕西省"},
    {"adcode": "620000", "name": "甘肃省"},
    {"adcode": "630000", "name": "青海省"},
    {"adcode": "640000", "name": "宁夏回族自治区"},
    {"adcode": "650000", "name": "新疆维吾尔自治区"},
    {"adcode": "710000", "name": "台湾省"},
    {"adcode": "810000", "name": "香港特别行政区"},
    {"adcode": "820000", "name": "澳门特别行政区"},
]


def _connect():
    """Open the shared local MySQL connection ('youxia' schema).

    Extracted from the five copies of this call scattered through the
    original file; credentials/host are unchanged.
    """
    return pymysql.connect(host='127.0.0.1', port=3306, user='json',
                           passwd='123456', db='youxia', charset="utf8")


def saveProvince():
    """Insert every entry of ``provinceList`` into gd_area (code, name)."""
    db = _connect()
    try:
        cursor = db.cursor()
        sql = 'insert into gd_area(code,name) values(%s,%s)'
        for province in provinceList:
            cursor.execute(sql, (province['adcode'], province['name']))
            db.commit()  # original committed per row; kept as-is
        cursor.close()
    finally:
        db.close()


def connectMysql():
    """Smoke-test the DB connection by printing the server version."""
    db = _connect()
    try:
        cursor = db.cursor()
        cursor.execute("select version()")
        data = cursor.fetchone()
        print("DB version is : %s" % data)
        cursor.close()
    finally:
        db.close()


def queryMysql():
    """Fetch one gd_area row and print its third column (sanity check)."""
    db = _connect()
    try:
        cursor = db.cursor()
        cursor.execute(" select * from gd_area ")
        data = cursor.fetchone()
        print(data[2])
        cursor.close()
    finally:
        db.close()


def parseHtml():
    """Parse a locally saved AMap city-list page and insert each city as a
    level-2 gd_area row under its province.

    Expected markup: one <dl> whose <dt> holds a province label (with one
    trailing marker character) and whose following sibling contains <li>
    elements carrying an ``adcode`` attribute.

    NOTE(review): assumes saveProvince() ran first -- if a province name is
    missing from gd_area, fetchone() returns None and the insert raises.
    """
    db = _connect()
    try:
        cursor = db.cursor()
        sql_select = " select * from gd_area where name =%s"
        sql_update = " insert into gd_area(code,name,level,parentId) values(%s,%s,2,%s) "
        path = '/Users/chanming/Desktop/a.html'
        with open(path, 'r') as htmlfile:
            htmlhandle = htmlfile.read()
        htmlhandle = bs_preprocess(htmlhandle)
        soup = BeautifulSoup(htmlhandle, 'lxml')
        pdl = soup.find('dl')
        for child in pdl.children:
            if child and child.name == 'dt':
                # BUG FIX: original passed a bare string wrapped in parens
                # (not a tuple) as the parameter sequence.
                cursor.execute(sql_select, (child.get_text()[0:-1],))
                result = cursor.fetchone()
                # renamed from `next` -- it shadowed the builtin
                sibling = child.nextSibling
                for perLi in sibling.children:
                    cursor.execute(sql_update,
                                   (perLi.get('adcode'), perLi.get_text(), result[1]))
                db.commit()
        cursor.close()
    finally:
        db.close()


def bs_preprocess(html):
    """Remove distracting whitespaces and newline characters."""
    pat = re.compile(r'(^[\s]+)|([\s]+$)', re.MULTILINE)
    html = re.sub(pat, '', html)        # remove leading and trailing whitespaces
    html = re.sub('\n', ' ', html)      # convert newlines to spaces
    html = re.sub(r'[\s]+<', '<', html)  # remove whitespaces before opening tags
    html = re.sub(r'>[\s]+', '>', html)  # remove whitespaces after closing tags
    return html


# SECURITY(review): these URLs embed a live AMap API key/query -- rotate the
# key and move it to configuration before sharing this file.
detailUrl = "https://www.amap.com/detail/get/detail?id=%s"
schoolUrl = "https://www.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=1&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=13&city={}&geoobj=109.703841%7C38.259262%7C109.868675%7C38.302393&keywords=%E5%B9%BC%E5%84%BF%E5%9B%AD"


def pullschoolInfo():
    """One-off probe: fetch the kindergarten POI page for city 120000."""
    s = schoolUrl.format(120000)
    r = requests.get(s)
    print(r.json())


def readJsonFile():
    """Load the district tree from a local JSON dump into gd_districts."""
    db = _connect()
    try:
        cursor = db.cursor()
        path = "/Users/chanming/Desktop/dis.json"
        with open(path, 'r') as jsonStr:
            jsonObject = json.load(jsonStr)
        saveDis(jsonObject['districts'], 0, cursor, db)
        db.commit()
        cursor.close()
    finally:
        db.close()


def saveDis(disList, parent, cursor, db):
    """Recursively insert a district subtree into gd_districts.

    AMap serialises a missing citycode as an empty list; normalise it to ''
    before binding the insert parameters.
    """
    update_sql = "insert into gd_districts (name,citycode,adcode,parent) values( %s,%s,%s,%s)"
    if disList:
        for dis in disList:
            if not dis['citycode']:
                dis['citycode'] = ''
            cursor.execute(update_sql,
                           (dis['name'], dis['citycode'], dis['adcode'], parent))
            db.commit()
            saveDis(dis['districts'], dis['adcode'], cursor, db)


def overWork():
    """Crawl kindergarten POIs for every level-2 district and store them.

    Pages through the AMap place-text JSONP endpoint (50 rows/page), strips
    the ``jsonp_...(`` wrapper, and appends each finished adcode to a local
    progress file so interrupted runs can be resumed by hand.
    """
    url = "https://restapi.amap.com/v3/place/text?s=rsv3&children=&key=e035d1ccf41f290d31b2251ee9fd6deb&offset=50&page={}&city={}&citylimit=true&extensions=all&language=undefined&callback=jsonp_381535_&platform=JS&logversion=2.0&appname=file%3A%2F%2F%2FUsers%2Fchanming%2FDesktop%2Fb.html&csid=D740A44A-DE94-4ECB-9478-1A811AF84359&sdkversion=1.4.10&keywords=%E5%B9%BC%E5%84%BF%E5%9B%AD"
    db = _connect()
    try:
        cursor = db.cursor()
        cursor.execute(" select * from gd_districts where LEVEL=2 ")
        sqlList = cursor.fetchall()
        headers = {'Content-Type': 'application/json'}
        with open('/Users/chanming/Desktop/c.txt', 'a') as f:
            for sql in sqlList:
                requrl = url.format(1, sql[2])
                # BUG FIX: original passed `headers` positionally, which is
                # requests' `params` argument -- the header was never sent.
                result = requests.get(requrl, headers=headers)
                # strip the JSONP wrapper: everything between '(' and the final ')'
                index = result.text.index('(')
                jsonObject = json.loads(result.text[index + 1:-1])
                pages = int(math.ceil(float(jsonObject['count']) / 50))
                parsepois(jsonObject['pois'], cursor, db)
                for i in range(1, pages):
                    requrl = url.format(i + 1, sql[2])
                    result = requests.get(requrl, headers=headers)
                    index = result.text.index('(')
                    jsonObject = json.loads(result.text[index + 1:-1])
                    parsepois(jsonObject['pois'], cursor, db)
                f.write(sql[2] + ",")
                print(sql[2])
        cursor.close()
    finally:
        db.close()


# POI fields that AMap returns as [] when absent; normalised to '' below.
_OPTIONAL_POI_FIELDS = ('address', 'tel', 'website', 'email', 'pcode',
                        'pname', 'citycode', 'adcode', 'adname')


def parsepois(poisList, cursor, db):
    """Insert one page of AMap POI dicts into gd_kids (commit per row)."""
    update_sql = "insert into gd_kids(idstr,name,address,tel,website,email,pcode,pname,citycode,cityname,adcode,adname)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) "
    if poisList:
        for poi in poisList:
            for field in _OPTIONAL_POI_FIELDS:
                if not poi[field]:
                    poi[field] = ''
            cursor.execute(update_sql,
                           (poi['id'], poi['name'], poi['address'], poi['tel'],
                            poi['website'], poi['email'], poi['pcode'],
                            poi['pname'], poi['citycode'], poi['cityname'],
                            poi['adcode'], poi['adname']))
            db.commit()


if __name__ == "__main__":
    # Script entry point (original called overWork() at module top level).
    overWork()