네이버 뉴스 크롤링 + 형태소 분석
from selenium import webdriver
from bs4 import BeautifulSoup
from konlpy.tag import Okt
import requests
import time
from openpyxl import Workbook
from Nature.Utils.Util import Util
##
# 2019-05-01
class News:
    """Crawl Naver News headlines and collect stop-word candidates.

    The crawler opens the Naver News front page in headless Chrome, clicks
    through the section tabs, follows every clustered headline link, extracts
    the article body, tags it with the Okt part-of-speech tagger and
    accumulates every token whose tag is listed in ``STOPWORD_TAGS``. The
    accumulated words are finally written to an Excel workbook.

    NOTE(review): ``executable_path=`` and ``find_element_by_xpath`` are the
    Selenium 3 API; both were removed in Selenium 4 - confirm the pinned
    Selenium version before upgrading.
    """

    # Machine-specific locations, kept in one place so a subclass or a caller
    # can override them without editing method bodies.
    CHROME_DRIVER_PATH = "C:\\Users\\junhyeon.kim\\Documents\\chrome_driver\\chromedriver.exe"
    RESULT_XLSX_PATH = r"C:\Users\junhyeon.kim\Desktop\ezfarm\Nature\Result\stopWord.xlsx"

    # Seconds to wait for an article download before giving up on it.
    REQUEST_TIMEOUT = 10

    # Navigation-bar positions of the crawled sections:
    # 3: politics / 4: economy / 5: society / 6: life & culture /
    # 7: world / 8: IT & science
    SECTION_INDEXES = range(3, 9)

    # CSS path from the section page down to each headline container.
    CLUSTER_SELECTOR = ("div.cluster > "
                        "div.cluster_group > "
                        "div.cluster_body > "
                        "ul.cluster_list > "
                        "li.cluster_item > "
                        "div.cluster_text")

    # Inline script residue that ends up in the article text once the
    # newlines have been removed.
    FLASH_STUB = "// flash 오류를 우회하기 위한 함수 추가function _flash_removeCallback() {}"

    # Okt part-of-speech tags whose tokens are collected by removeWord().
    STOPWORD_TAGS = frozenset({"Foreign", "Josa", "Verb", "Adjective", "Modifier"})

    # Lazily created Okt tagger shared by all detail() calls of an instance.
    _okt = None

    def __init__(self):
        self.workBook = Workbook()  # Excel workbook that receives the result
        self.url = "https://news.naver.com/"
        self.chromeDriver = None    # created in getUrl(), released in destroy()
        self.josaList = set()       # accumulated stop-word candidates

    def getUrl(self):
        """Run the whole crawl and write the collected words to Excel.

        The browser is always released, even when the crawl fails part-way.
        The workbook is only written when every section was processed
        without an unhandled exception.
        """
        try:
            self.chromeDriver = self._createDriver()
            self.chromeDriver.get(self.url)
            self.chromeDriver.implicitly_wait(3)

            # Show the page title as a quick check that the page loaded.
            print(self.chromeDriver.title)
            time.sleep(2)

            for sectionIndex in self.SECTION_INDEXES:
                self._crawlSection(sectionIndex)

            print(self.josaList)
            self.writeXl(self.josaList)
        finally:
            self.destroy()

    def _createDriver(self):
        """Return a headless Chrome driver with a fixed 1920x1080 window."""
        option = webdriver.ChromeOptions()
        option.add_argument("headless")
        option.add_argument("window-size=1920x1080")
        option.add_argument("disable-gpu")
        return webdriver.Chrome(executable_path=self.CHROME_DRIVER_PATH,
                                options=option)

    def _crawlSection(self, sectionIndex):
        """Open one section tab and process every headline found on it."""
        xpath = '//*[@id="lnb"]/ul/li[' + str(sectionIndex) + ']/a/span[1]'
        self.chromeDriver.find_element_by_xpath(xpath).click()
        print(" >>> {}".format(self.chromeDriver.title))
        time.sleep(2)

        sectionSoup = BeautifulSoup(self.chromeDriver.page_source, "html.parser")
        for clusterText in sectionSoup.select(self.CLUSTER_SELECTOR):
            anchor = clusterText.select_one("a")
            # Skip containers without a link, and links without a plain
            # text title.
            if anchor is None or anchor.string is None:
                continue
            print("title : {0} , requ : {1}".format(anchor.string, anchor.attrs))

            href = anchor.attrs.get("href")
            if not href:
                continue

            articleText = self._fetchArticleText(href)
            if articleText is None:
                continue

            print(articleText)
            self.detail(articleText)
            print("===========================")

    def _fetchArticleText(self, articleUrl):
        """Download one article and return its cleaned body text.

        :param articleUrl: URL taken from a headline link
        :return: cleaned article text, or None when the download fails, the
                 response status is not 200, or the page has no
                 ``div#articleBodyContents`` element
        """
        try:
            response = requests.get(articleUrl, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as err:
            # One unreachable article must not abort the whole crawl.
            print("request failed : {0} ({1})".format(articleUrl, err))
            return None
        if response.status_code != 200:
            return None

        articleSoup = BeautifulSoup(response.text, "html.parser")
        body = articleSoup.select_one("div#articleBodyContents")
        if body is None:
            return None
        return self._cleanArticleText(body.text)

    @staticmethod
    def _cleanArticleText(rawText):
        """Remove newlines, the flash stub and surrounding whitespace.

        :param rawText: text content of the article body element
        :return: the cleaned text
        """
        # Newlines go first: FLASH_STUB only matches once the line break
        # inside the script residue has been removed.
        text = str(rawText).replace("\n", "")
        text = text.replace(News.FLASH_STUB, "")
        return text.strip()

    def detail(self, text):
        """Tag ``text`` with Okt and add its stop-word candidates to josaList.

        :param text: article text; empty text is ignored
        """
        if not text:
            return
        if self._okt is None:
            # Created once and reused instead of once per article.
            self._okt = Okt()
        self.josaList.update(self.removeWord(self._okt.pos(text)))

    def removeWord(self, d):
        """Collect the tokens whose tag is listed in ``STOPWORD_TAGS``.

        :param d: iterable of ``(token, tag)`` pairs as returned by ``Okt.pos``
        :return: set of tokens tagged Foreign, Josa, Verb, Adjective or
                 Modifier
        """
        return {token[0] for token in d if token[1] in self.STOPWORD_TAGS}

    def writeXl(self, wrdData, path=None):
        """Write the words into column 1 of the active sheet and save it.

        :param wrdData: iterable of words, one per row
        :param path: target file; defaults to ``RESULT_XLSX_PATH``
        """
        workSheet = self.workBook.active
        # Sorted so that repeated runs produce the same row order; iterating
        # a set directly gives an arbitrary order.
        for rowNumber, word in enumerate(sorted(wrdData), start=1):
            workSheet.cell(row=rowNumber, column=1).value = word
        self.workBook.save(path or self.RESULT_XLSX_PATH)
        self.workBook.close()

    def destroy(self):
        """Shut the browser down; safe to call more than once."""
        if self.chromeDriver is not None:
            # quit() ends the whole driver session; close() only closes the
            # current window.
            self.chromeDriver.quit()
            self.chromeDriver = None
def main():
    """Build a ``News`` crawler and run one full crawl."""
    crawler = News()
    crawler.getUrl()
# Start the crawl only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
'언어 > python' 카테고리의 다른 글
네이버 기사 크롤링 => elasticsearch 적재 (0) | 2019.07.12 |
---|---|
naver music 크롤링 + elasticsearch (0) | 2019.05.22 |
페이스북 - python (0) | 2019.04.24 |
python + outlook (0) | 2019.03.31 |
selenium_ (0) | 2019.03.11 |