
Lecture 89 - Python (9): URL-related functions, data crawling, and location lookup with geocoding

by Bennyziio 2019. 8. 5.

network
        URL                     - class for handling URL strings
        URLConnection           - class for handling connections

        - Google geolocation xml(json)

        - jstl (proxy)

        => the urllib package in Python

        Socket

https://docs.python.org/ko/3/library/urllib.html

URLEx01.ex01 - urlparse

from urllib.parse import urlparse

url = urlparse('https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=1&ie=utf8&query=starwars')
print(url)

from urllib.parse import urlparse

url = urlparse('https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=1&ie=utf8&query=starwars')
print(url)
print(url.scheme)

from urllib.parse import urlparse

url = urlparse('https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=1&ie=utf8&query=starwars')
print(url)
print(url.scheme)
print(url.hostname)

from urllib.parse import urlparse

url = urlparse('https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=1&ie=utf8&query=starwars')
print(url)
print(url.scheme)
print(url.hostname)
print(url.query)
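
For reference, urlparse returns a ParseResult named tuple with six components; a minimal sketch (with a shortened query string and an added #top fragment just for illustration):

from urllib.parse import urlparse

url = urlparse('https://search.naver.com/search.naver?where=nexearch&query=starwars#top')
print(url.scheme)    # https
print(url.netloc)    # search.naver.com
print(url.path)      # /search.naver
print(url.params)    # (empty - parameters of the last path segment, rarely used)
print(url.query)     # where=nexearch&query=starwars
print(url.fragment)  # top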

URLEx01.ex02 - urlunparse

from urllib.parse import urlunparse

url = urlunparse(('http', 'www.exam.com', '/hello', '', '', ''))
print(url)

from urllib.parse import urlunparse

url = urlunparse(('http', 'www.exam.com', '/hello', 'params', 'a-b', 'flag'))
print(url)
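
Since a ParseResult is itself a 6-tuple, it can be fed straight back into urlunparse to rebuild the URL; a minimal round-trip sketch:

from urllib.parse import urlparse, urlunparse

parts = urlparse('https://search.naver.com/search.naver?where=nexearch&query=starwars')
rebuilt = urlunparse(parts)       # a ParseResult is itself a 6-tuple
print(rebuilt)
print(rebuilt == parts.geturl())  # True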

URLEx01.ex03 - urlencode

from urllib.parse import urlencode

form = { 'name': 'jhpark', 'phone': '010-1111-2222'}
encform = urlencode(form)
print(encform)

from urllib.parse import urlencode

form = { 'name': '홍길동', 'phone': '010-1111-2222'}
encform = urlencode(form)
print(encform)

from urllib.parse import urlencode, parse_qs

form = { 'name': '홍길동', 'phone': '010-1111-2222'}
encform = urlencode(form)
print(encform)
#name=%ED%99%8D%EA%B8%B8%EB%8F%99&phone=010-1111-2222

qsform = parse_qs('name=%ED%99%8D%EA%B8%B8%EB%8F%99&phone=010-1111-2222')
print(qsform)
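
Note that parse_qs wraps every value in a list, because a key can appear more than once in a query string; a minimal sketch, also showing parse_qsl, which returns (key, value) pairs that urlencode accepts directly:

from urllib.parse import parse_qs, parse_qsl, urlencode

# parse_qs always wraps values in a list, because a key may repeat
print(parse_qs('name=jhpark&hobby=ski&hobby=golf'))
# {'name': ['jhpark'], 'hobby': ['ski', 'golf']}

# parse_qsl keeps the order as (key, value) pairs, which urlencode accepts directly
pairs = parse_qsl('name=jhpark&hobby=ski&hobby=golf')
print(pairs)
print(urlencode(pairs))          # name=jhpark&hobby=ski&hobby=golf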

URLEx01.ex04 - urlopen

from urllib.request import urlopen

urldata = urlopen('https://m.naver.com')
print(urldata)

from urllib.request import urlopen

urldata = urlopen('https://m.naver.com')
print(urldata)
print(urldata.headers)

from urllib.request import urlopen

urldata = urlopen('https://m.naver.com')
print(urldata)
print(urldata.headers)

html = urldata.read()
print(html)

from urllib.request import urlopen

urldata = urlopen('https://m.naver.com')
print(urldata)
print(urldata.headers)

html = urldata.read()
print(html.decode('utf-8'))
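
A slightly more defensive variant, as a minimal sketch: open the response as a context manager so it is closed automatically, and read the charset from the response headers instead of hard-coding 'utf-8':

from urllib.request import urlopen

# The response works as a context manager, so the connection is closed automatically;
# the charset comes from the response headers instead of being hard-coded
with urlopen('https://m.naver.com') as urldata:
    charset = urldata.headers.get_content_charset() or 'utf-8'
    html = urldata.read().decode(charset)

print(html[:200])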

URLEx01.ex05 - Request, urlopen : build a Request object for the URL, then open it

from urllib.request import Request, urlopen

url = 'https://m.naver.com'
req = Request(url)
urldata = urlopen(req)

html = urldata.read()
print(html.decode('utf-8'))
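
For reference, Request can also send a POST: passing data= as bytes switches the method. A minimal sketch, assuming the public echo service httpbin.org/post (not part of the lecture) just to see the submitted form echoed back:

from urllib.request import Request, urlopen
from urllib.parse import urlencode

# Passing data= (bytes) to Request turns the request into a POST
form = {'name': 'jhpark', 'phone': '010-1111-2222'}
req = Request('https://httpbin.org/post', data=urlencode(form).encode('utf-8'))

with urlopen(req) as urldata:
    print(urldata.read().decode('utf-8'))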

from urllib.request import Request, urlopen
from urllib.parse import urlencode

# https://search.naver.com/search.naver
# ?
# where=nexearch&sm=top_hty&fbm=1&ie=utf8&query=%EC%8A%A4%ED%83%80%EC%9B%8C%EC%A6%88

url = 'https://search.naver.com/search.naver?'
querystring = {'where': 'nexearch', 'sm': 'top_hty', 'fbm': '1', 'ie': 'utf8', 'query': '스타워즈'}
req = Request(url + urlencode(querystring))
urldata = urlopen(req)

html = urldata.read()
print(html.decode('utf-8'))

Running this raises HTTPError: HTTP Error 403. The request did not come from a normal browser, so the server blocked it.

from urllib.request import Request, urlopen
from urllib.parse import urlencode

# https://search.naver.com/search.naver
# ?
# where=nexearch&sm=top_hty&fbm=1&ie=utf8&query=%EC%8A%A4%ED%83%80%EC%9B%8C%EC%A6%88

url = 'https://search.naver.com/search.naver?'
querystring = {'where': 'nexearch', 'sm': 'top_hty', 'fbm': '1', 'ie': 'utf8', 'query': '스타워즈'}
req = Request(url + urlencode(querystring), headers= {'User-Agent': 'Mozilla/5.0'})
urldata = urlopen(req)

html = urldata.read()
print(html.decode('utf-8'))

If we disguise the request as coming from a browser by putting 'User-Agent': 'Mozilla/5.0' in the headers, the data comes back normally, as shown above.
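
The header can also be attached after the Request object is created; a minimal sketch using Request.add_header, which has the same effect as the headers= argument:

from urllib.request import Request, urlopen
from urllib.parse import urlencode

url = 'https://search.naver.com/search.naver?'
querystring = {'where': 'nexearch', 'query': '스타워즈'}

req = Request(url + urlencode(querystring))
req.add_header('User-Agent', 'Mozilla/5.0')   # same effect as the headers= argument above

with urlopen(req) as urldata:
    print(urldata.read().decode('utf-8'))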

Let's build the same thing for Daum search.
URLEx01.ex06 - the User-Agent value must be copied correctly; use the browser's developer tools to find it.

from urllib.request import Request, urlopen
from urllib.parse import urlencode

# https://search.daum.net/search
# ?
# w=tot&DA=YZR&t__nil_searchbox=btn&sug=&sugo=&q=%EC%8A%A4%ED%83%80%EC%9B%8C%EC%A6%88

url = 'https://search.daum.net/search?'
querystring = {'w': 'tot', 'DA': 'YZR', 't__nil_searchbox': 'btn', 'sug': '', 'sugo': '', 'q': '스타워즈'}
req = Request(url + urlencode(querystring), headers= {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'})

# Header value copied from the browser's developer tools:
# User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
urldata = urlopen(req)

html = urldata.read()
print(html.decode('utf-8'))
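
Both the Naver and Daum searches follow the same pattern, so it can be factored into a small helper; a minimal sketch (fetch_html is a hypothetical name, not part of the lecture code):

from urllib.request import Request, urlopen
from urllib.parse import urlencode

def fetch_html(base_url, params, user_agent='Mozilla/5.0'):
    # Build the query string, attach a browser-like User-Agent, return the decoded body
    req = Request(base_url + '?' + urlencode(params),
                  headers={'User-Agent': user_agent})
    with urlopen(req) as resp:
        charset = resp.headers.get_content_charset() or 'utf-8'
        return resp.read().decode(charset)

print(fetch_html('https://search.daum.net/search', {'w': 'tot', 'q': '스타워즈'}))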

[Screenshots: the User-Agent value shown in the developer tools of Internet Explorer and Chrome]

geocoding
Previously, you could call the Google Geocoding API with
https://maps.googleapis.com/maps/api/geocode/json?address=경복궁
and look the place up on the map, but Google has since switched to a paid billing policy.

Individual users can still use it for free for now, so let's try it (while signed in to a Google account).

Let's take a station name as input and print the resulting JSON.
URLEx01.ex07

from urllib.request import Request, urlopen
from urllib.parse import urlencode

# https://maps.googleapis.com/maps/api/geocode/json
# ?
# address=개봉역&key=...

url = 'https://maps.googleapis.com/maps/api/geocode/json?'
dongname = '개봉역'
querystring = {'address': dongname, 'key': 'YOUR_GEOCODING_API_KEY'}  # put your own Geocoding API key here
req = Request(url + urlencode(querystring), headers= {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'})

# Header value copied from the browser's developer tools:
# User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
urldata = urlopen(req)

html = urldata.read()
print(html.decode('utf-8'))

The result comes back in English, but if you compare the latitude, longitude, and so on, the values are the same.

To get the response in Korean, add 'accept-language': 'ko-KR' to the headers.

from urllib.request import Request, urlopen
from urllib.parse import urlencode

# https://maps.googleapis.com/maps/api/geocode/json
# ?
# address=개봉역&key=...

url = 'https://maps.googleapis.com/maps/api/geocode/json?'
dongname = '개봉역'
querystring = {'address': dongname, 'key': 'AIzaSyArHE1hXsBVVHAr1W1eUjLOp34W6hcybIU'}
req = Request(url + urlencode(querystring), headers= {'accept-language': 'ko-KR', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'})

# Header value copied from the browser's developer tools:
# User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
urldata = urlopen(req)

html = urldata.read()
print(html.decode('utf-8'))

Now let's take the location as user input and search for it.

from urllib.request import Request, urlopen
from urllib.parse import urlencode

# https://maps.googleapis.com/maps/api/geocode/json
# ?
# address=(entered location)&key=...
dongname = input('장소를 입력해주세요 : ')
if len(dongname) <= 1:
	print('장소를 한 자 이상 입력해 주세요')
	exit()

url = 'https://maps.googleapis.com/maps/api/geocode/json?'

querystring = {'address': dongname, 'key': 'AIzaSyArHE1hXsBVVHAr1W1eUjLOp34W6hcybIU'}
req = Request(url + urlencode(querystring), headers= {'accept-language': 'ko-KR', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'})

# Header value copied from the browser's developer tools:
# User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
urldata = urlopen(req)

html = urldata.read()
print(html.decode('utf-8'))
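
Rather than dumping the raw JSON, the response can also be parsed with the json module to pull out just the address and coordinates. A minimal sketch, with the API key replaced by a placeholder and assuming the standard results / geometry / location layout of the Geocoding response:

import json
from urllib.request import Request, urlopen
from urllib.parse import urlencode

url = 'https://maps.googleapis.com/maps/api/geocode/json?'
querystring = {'address': '개봉역', 'key': 'YOUR_GEOCODING_API_KEY'}  # put your own key here
req = Request(url + urlencode(querystring), headers={'accept-language': 'ko-KR', 'User-Agent': 'Mozilla/5.0'})

with urlopen(req) as urldata:
    result = json.loads(urldata.read().decode('utf-8'))

# The response carries a status field and a results list;
# each result has a formatted_address and geometry.location with lat/lng
if result['status'] == 'OK':
    first = result['results'][0]
    location = first['geometry']['location']
    print(first['formatted_address'])
    print('lat:', location['lat'], 'lng:', location['lng'])
else:
    print('Geocoding failed:', result['status'])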

 
