데이트를 위해 맛집 크롤링 중
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import xlsxwriter # 엑셀
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import time
import os
# =====================================================
class Crawling(object):
    """Crawl Naver blog-review search results for ``search_data`` and dump
    one row per review (title / url) into an ``.xlsx`` workbook.

    Usage: ``URLrequest()`` to collect, then ``XLwrite()`` to export.
    """

    def __init__(self, search_data):
        # Work inside the output directory so the workbook lands there.
        os.chdir(r'C:\Users\sleep\Desktop\date_info')
        self.search_data = search_data
        self.target_url = "https://www.naver.com/"
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        # NOTE(review): `chrome_options=` is deprecated in selenium >= 3.8
        # (newer API is `options=`) — kept as-is for the pinned driver setup.
        self.driver = webdriver.Chrome(r'C:\Users\sleep\Desktop\chrom_driver\chromedriver.exe',
                                       chrome_options=self.chrome_options)
        # Excel workbook/worksheet — created lazily in XLwrite().
        self.wb = None
        self.ws = None
        self.food_info_list = list()  # one dict per scraped review
        self.index_list = ['title', 'url', 'latitude', 'longitude']  # header labels
        self.width_list = [87.5, 47.7, 15.8, 15.8]                   # column widths
        self.index_num = [2, 3, 4, 5]                                # header column indices

    # func (1)
    def URLrequest(self):
        """Search Naver for ``self.search_data``, walk every page of the
        blog-review tab and append ``{thum_name_url, title, url, page}``
        dicts to ``self.food_info_list``.
        """
        self.driver.get(self.target_url)  # was a re-hardcoded literal; use the attribute
        assert "NAVER" in self.driver.title
        print("title : {}".format(self.driver.title))
        self.driver.find_element_by_name('query').send_keys(self.search_data)
        self.driver.find_element_by_id('search_btn').click()
        # Scroll half-way so the lazily rendered section link appears.
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
        self.driver.find_element_by_xpath('//*[@id="main_pack"]/div[2]/div[2]/a').click()
        page = 1
        x = 10  # index of the "next page" anchor: 10 on the first page, 11 afterwards
        while True:
            print("{} 페이지 작업 중 ==========================".format(page))
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # BUG FIX: the original slept AFTER grabbing page_source, so the
            # parsed HTML could predate the scroll; wait first, then parse.
            time.sleep(3)
            html = BeautifulSoup(self.driver.page_source, "html.parser")
            for item in html.select('ul.type01 > li'):
                t_data = self._parse_item(item, page)
                self.food_info_list.append(t_data)
                print(t_data)
            try:
                self.driver.find_element_by_xpath(
                    '//*[@id="main_pack"]/div[3]/a[{}]'.format(x)).click()
            except Exception:  # was a bare except — keep the best-effort stop
                print("page _end")
                break
            else:
                x = 11
                page += 1
                time.sleep(1)

    def _parse_item(self, item, page):
        """Extract thumbnail href / title / post href from one ``<li>`` result.

        Any missing element leaves the corresponding value as ``None``
        (``select_one`` returns ``None`` -> ``.attrs`` raises AttributeError).
        """
        t_data = {"thum_name_url": None,
                  "title": None,
                  "url": None,
                  "page": page}
        thumb_nail = item.select_one('div.review_thumb > a')
        try:
            t_data['thum_name_url'] = thumb_nail.attrs['href']
        except AttributeError as e:
            print(e)
        title_list = item.select_one('div.review_content > a')
        try:
            t_data['title'] = title_list.attrs['title']
        except AttributeError as e:
            print(e)
        try:
            t_data['url'] = title_list.attrs['href']
        except AttributeError as e:
            print(e)
        return t_data

    # func (2)
    def XLwrite(self):
        """Write the collected reviews to ``<search_data>_.xlsx`` with a
        bordered header row at (3, 2) and one data row per review below it.
        """
        self.wb = xlsxwriter.Workbook(self.search_data + '_.xlsx')
        self.cell_format = self.wb.add_format()
        self.cell_format.set_bottom()  # border: bottom
        self.cell_format.set_top()     # border: top
        self.cell_format.set_left()    # border: left
        self.cell_format.set_right()   # border: right
        self.ws = self.wb.add_worksheet()
        # Column widths for the header columns.
        for col, width in zip(self.index_num, self.width_list):
            self.ws.set_column(col, col, width)
        r = 3
        c = 2
        # BUG FIX: write_string() is wrapped by @convert_cell_args, which
        # requires the cell reference as the FIRST POSITIONAL argument;
        # keyword calls (row=/col=/string=) crash. Use positional args.
        for header in self.index_list:
            self.ws.write_string(r, c, header, self.cell_format)
            c += 1
        r += 1  # next row
        c = 2
        for info in self.food_info_list:
            # Same column order the original key scan produced: title, url.
            for key in ('title', 'url'):
                # Guard: write_string() requires a str; a review with a
                # missing title/url stored None — write an empty cell instead.
                self.ws.write_string(r, c, info[key] or '', self.cell_format)
                c += 1
            c = 2
            r += 1  # next row
        self.wb.close()
        print("엑셀 적재 완료")
def main():
    """Entry point for the crawler: collect results, then export to Excel."""
    crawler = Crawling("여의도 데이트 음식점")
    crawler.URLrequest()
    crawler.XLwrite()


if __name__ == "__main__":
    main()
'python > request + bs4' 카테고리의 다른 글:
- python3 request (0) | 2018.03.11
python3 request
import requests
# HTML 소스 가져오기
def HTMLsouceGet(p):
    """Print the body text of an HTTP response.

    (The typo 'souce' is the function's public name — kept for callers.)
    """
    body = p.text
    print(body)
# HTTP 헤더 정보
def HTTPheaderGet(p):
    """Return the response's header mapping unchanged."""
    return p.headers
def HTTPstatusGet(p):
    """Print the numeric HTTP status code of the response."""
    code = p.status_code
    print(code)
def main():
    """Fetch one page with a GET request and report its status + headers."""
    req = requests.get(url='http://www.kitri.re.kr/academy/it_education/job_status2009.web')
    is_ok = req.ok
    print("is_ok => {result}".format(result=is_ok))
    if is_ok:  # reuse the value computed above instead of re-reading req.ok
        HTTPstatusGet(req)
        d = HTTPheaderGet(req)
        # Iterate key/value pairs directly instead of d.keys() + d[i] lookups.
        for name, value in d.items():
            print(name, value, sep='=> ')
    else:
        print("요청 에러")


if __name__ == "__main__":
    main()
is_ok => True
200
Date=> Sun, 11 Mar 2018 06:17:49 GMT
Server=> Apache/2.2.15 (CentOS)
Set-Cookie=> JSESSIONID=B27FC1FD587B1967B6F36A6A8755CA45; Path=/; HttpOnly
Content-Language=> ko-KR
Connection=> close
Transfer-Encoding=> chunked
Content-Type=> text/html;charset=UTF-8
'python > request + bs4' 카테고리의 다른 글:
- 데이트를 위해 맛집 크롤링 중 (0) | 2018.11.22