네이버 뉴스 크롤링 + 형태소

언어/python · 2019. 5. 1. 14:20

뷰어
댓글로
이전글
다음글

from selenium import webdriver
from bs4 import BeautifulSoup
from konlpy.tag import Okt
import requests
import time
from openpyxl import Workbook
from Nature.Utils.Util import Util
##
# 2019-05-01
class News:
    """Crawl Naver News section pages and collect stop-word candidates.

    The crawler opens the Naver News front page in headless Chrome, visits
    a fixed range of section tabs, downloads every headline article found
    on each section page, runs the article body through the Okt
    morphological analyzer and accumulates the words whose tag is listed
    in ``STOP_TAGS``. The accumulated words are finally written to an
    Excel workbook.
    """

    # Location of the chromedriver binary handed to Selenium.
    DRIVER_PATH = "C:\\Users\\junhyeon.kim\\Documents\\chrome_driver\\chromedriver.exe"
    # Workbook file the collected words are saved to.
    RESULT_PATH = r"C:\Users\junhyeon.kim\Desktop\ezfarm\Nature\Result\stopWord.xlsx"
    # Inline script text that leaks into the article body text; stripped out.
    FLASH_NOISE = "// flash 오류를 우회하기 위한 함수 추가function _flash_removeCallback() {}"
    # Okt part-of-speech tags that removeWord keeps.
    STOP_TAGS = frozenset(("Foreign", "Josa", "Verb", "Adjective", "Modifier"))
    # Seconds to wait for an article request before giving up on it.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.workBook = Workbook()  # Excel workbook object
        self.url = "https://news.naver.com/"
        self.chromeDriver = None    # created in getUrl, released in destroy
        self.josaList = set()       # words collected across all articles
        self.okt = None             # Okt analyzer, created on first use
        #self.params = Util.getConfiguration()

    def getUrl(self):
        """Run the whole crawl and save the collected words.

        Starts headless Chrome, walks the section tabs at positions 3..8 of
        the ``lnb`` navigation list, processes every section page, prints
        and saves the collected words, and always releases the browser,
        even when a step raises.
        """
        # Option-----------------------------------------------
        option = webdriver.ChromeOptions()
        option.add_argument("headless")
        option.add_argument("window-size=1920x1080")
        option.add_argument("disable-gpu")

        # -----------------------------------------------------
        self.chromeDriver = webdriver.Chrome(executable_path=self.DRIVER_PATH,
                                             options=option)

        try:
            self.chromeDriver.get(self.url)
            self.chromeDriver.implicitly_wait(3)

            # Check the page title ---------------------------------
            print(self.chromeDriver.title)
            time.sleep(2)

            # 3 : politics / 4 : economy / 5 : society / 6 : life & culture
            # 7 : world / 8 : IT & science
            for p in range(3, 9):
                # Click through to the section tab.
                self.chromeDriver.find_element_by_xpath(
                    '//*[@id="lnb"]/ul/li[' + str(p) + ']/a/span[1]').click()
                print(" >>> {}".format(self.chromeDriver.title))
                time.sleep(2)

                self._collectSection(self.chromeDriver.page_source)

            print(self.josaList)
            self.writeXl(self.josaList)
        finally:
            # Release the browser on failure as well as on success.
            self.destroy()

    def _collectSection(self, pageSource):
        """Process every headline article linked from one section page.

        :param pageSource: HTML of the section page as rendered by Chrome
        """
        bsObject = BeautifulSoup(pageSource, "html.parser")

        cluster = bsObject.select("div.cluster > "
                                  "div.cluster_group > "
                                  "div.cluster_body > "
                                  "ul.cluster_list > "
                                  "li.cluster_item > "
                                  "div.cluster_text")
        for c in cluster:
            t = c.select_one("a")
            # Skip entries without an anchor or without plain-string text.
            if t is None or t.string is None:
                continue

            print("title : {0} , requ : {1}".format(t.string, t.attrs))

            href = t.attrs.get("href")
            if not href:
                continue

            resltText = self._fetchArticleText(href)
            if resltText is None:
                continue

            print(resltText)
            self.detail(resltText)

            print("===========================")

    def _fetchArticleText(self, href):
        """Download one article and return its cleaned body text.

        :param href: article URL taken from the headline anchor
        :return: body text with newlines, the flash-workaround script text
                 and surrounding whitespace removed; ``None`` when the
                 request fails, the status is not 200, or the page has no
                 ``div#articleBodyContents`` element
        """
        try:
            html = requests.get(href, timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as err:
            # One unreachable article must not abort the whole crawl.
            print("request failed : {0} ({1})".format(href, err))
            return None

        if html.status_code != 200:
            return None

        txt = BeautifulSoup(html.text, "html.parser").select_one("div#articleBodyContents")
        if txt is None:
            return None

        # Clean-up: drop line breaks, drop the script residue, trim both ends.
        resltText = str(txt.text).replace("\n", "")
        resltText = resltText.replace(self.FLASH_NOISE, "")
        return resltText.strip()

    def detail(self, text):
        """Tag *text* with Okt and merge the selected words into josaList.

        :param text: cleaned article body; empty text is ignored
        """
        if not text:
            return

        # Build the analyzer once and reuse it for every article.
        if getattr(self, "okt", None) is None:
            self.okt = Okt()

        self.josaList.update(self.removeWord(self.okt.pos(text)))

    def removeWord(self, d):
        """Select the words whose part-of-speech tag is in STOP_TAGS.

        :param d: iterable of ``(word, tag)`` pairs
        :return: set of words tagged Foreign, Josa, Verb, Adjective or
                 Modifier
        """
        return {i[0] for i in d if i[1] in self.STOP_TAGS}

    # Write the collected data into the Excel workbook
    def writeXl(self, wrdData):
        """Write the words into column 1 of the active sheet and save.

        :param wrdData: iterable of words; written in sorted order so the
                        resulting file is reproducible between runs
        """
        workSheet = self.workBook.active

        for n, w in enumerate(sorted(wrdData), start=1):
            workSheet.cell(row=n, column=1).value = w

        self.workBook.save(self.RESULT_PATH)
        self.workBook.close()

    def destroy(self):
        """Shut down the browser session if one was started."""
        if self.chromeDriver is not None:
            # quit() ends the driver session; close() only closes the window.
            self.chromeDriver.quit()
            self.chromeDriver = None

def main():
    """Script entry point: build a News crawler and run the full crawl."""
    News().getUrl()


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

'언어 > python' 카테고리의 다른 글

네이버 기사 크롤링 => elasticsearch 적재 (0)	2019.07.12
naver music 크롤링 + elasticsearch (0)	2019.05.22
페이스북 - python (0)	2019.04.24
python + outlook (0)	2019.03.31
selenium_ (0)	2019.03.11

일	월	화	수	목	금	토
			1	2	3	4
5	6	7	8	9	10	11
12	13	14	15	16	17	18
19	20	21	22	23	24	25
26	27	28	29	30	31

길

네이버 뉴스 크롤링 + 형태소

'언어 > python' 카테고리의 다른 글

최근에 올라온 글

최근에 달린 댓글

공지사항

글 보관함

링크

티스토리툴바