python + crawling + elasticsearch
2019-02-04 deliverable
---------------------------
from json import loads
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
import re

from semi_proj.elasticSearchNode import Ela


class NMV:
    def __init__(self):
        self.__url = None
        self.__path = None
        self.__param = {"sel": None, "page": None, "date": None, "tg": None}
        self.__bsobj = None
        self.element = []

    def paramSetting(self):
        # Load the base URL, path, and fixed query parameters from info.json
        with open("./info.json", "r", encoding="utf-8") as f:
            json_doc = loads(f.read())
            print(json_doc)
            self.__url = json_doc["url"]
            self.__path = json_doc["path"]
            self.__param["sel"] = json_doc["sel"]
            self.__param["date"] = json_doc["date"]
            self.__param["tg"] = json_doc["tg"]

    def indexCreate(self):
        Ela.createIndex()

    def urlRequests(self):
        # Ranking pages 1-3
        for p in range(1, 4):
            self.__param["page"] = p
            param = urlencode(self.__param)
            # https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=pnt&date=20190202&tg=18&page=1
            requ = self.__url + self.__path + "?" + param
            html = requests.get(requ)
            if html.status_code == 200:
                self.__bsobj = BeautifulSoup(html.text, "html.parser")
                # Links to each movie's detail page
                a_list = list(map(lambda x: x.attrs["href"], self.__bsobj.select("div.tit5 > a")))
                for link in a_list:
                    # ex: => https://movie.naver.com/movie/bi/mi/basic.nhn?code=10200
                    sub_url = self.__url + link
                    html = requests.get(sub_url)
                    if html.status_code == 200:
                        self.__bsobj = BeautifulSoup(html.text, "html.parser")
                        info = self.__bsobj.select_one("div.mv_info")
                        try:
                            movie_name1 = info.select_one("h3.h_movie > a").string
                        except AttributeError as e:
                            print(e)
                        else:
                            movie_name2 = re.sub("[\n\r\t]", "", str(info.select_one("strong.h_movie2").string))
                            # Overview block
                            # #content > div.article > div.mv_info_area > div.mv_info > dl
                            tmp_summary = self.__bsobj.select_one(
                                "#content > div.article > div.mv_info_area > div.mv_info > dl > dd > p")
                            tmp_dir = {
                                "movie_name1": movie_name1,
                                "movie_name2": movie_name2,
                                "jangr": None,
                                "nation": None,
                                "minute": None,
                            }
                            for i in range(1, 5):
                                # ========================
                                jangr = {}   # genre
                                nation = {}  # production country
                                minute = {}  # running time
                                day = {}     # release date
                                # ========================
                                tmp = tmp_summary.select_one("span:nth-of-type({})".format(i))
                                if i != 3:
                                    tmp = tmp.select("a")
                                    tmp_list = list(map(lambda x: re.sub("[\n\r\t ]", "", str(x.string)), tmp))
                                    if i == 1:
                                        jangr["jangr"] = tmp_list
                                        tmp_dir["jangr"] = jangr["jangr"]
                                    elif i == 2:
                                        nation["nation"] = tmp_list
                                        tmp_dir["nation"] = nation["nation"]
                                    elif i == 4:
                                        # Release dates arrive as (year, month-day) pairs; join each pair
                                        ttmp_list = []
                                        for j in range(0, len(tmp_list), 2):
                                            ttmp_list.append(tmp_list[j] + tmp_list[j + 1])
                                        day["day"] = ttmp_list
                                        tmp_dir["day"] = day["day"]
                                else:  # i == 3: running time
                                    tmp_list = list(map(lambda x: re.sub("[\n\r\t ]", "", str(x.string)), tmp))
                                    minute["minute"] = tmp_list
                                    tmp_dir["minute"] = minute["minute"]
                            print(tmp_dir)
                            self.element.append(tmp_dir)
        Ela.dataInsert(self.element)


class T(NMV):
    pass


t = T()
# t.indexCreate()
t.paramSetting()
t.urlRequests()
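
For reference, each entry appended to self.element is a plain dict built from the detail page, so a single document handed to Ela.dataInsert() looks roughly like the sketch below. The field names come from the code above; the values are made-up placeholders, not actual crawl output.

# Hypothetical example of one crawled document in self.element
# (placeholder values; real values depend on the page crawled)
sample_doc = {
    "movie_name1": "한글 제목",                 # title from h3.h_movie > a
    "movie_name2": "English Title, 2018",      # title from strong.h_movie2
    "jangr": ["드라마"],                        # genre(s)
    "nation": ["미국"],                         # production country
    "minute": ["120분"],                        # running time
    "day": ["2018.12.19개봉"],                  # release date(s)
}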
-------------------------------
json file (info.json)
{
    "url": "https://movie.naver.com",
    "path": "/movie/sdb/rank/rmovie.nhn",
    "sel": "pnt",
    "date": "20190202",
    "tg": "18"
}
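
For clarity, these fields map directly onto the request URL that urlRequests() builds with urlencode; a minimal sketch (with page fixed to 1) looks like this:

from urllib.parse import urlencode

# Minimal sketch of how info.json turns into the ranking-page URL
conf = {"url": "https://movie.naver.com", "path": "/movie/sdb/rank/rmovie.nhn",
        "sel": "pnt", "date": "20190202", "tg": "18"}
query = urlencode({"sel": conf["sel"], "page": 1, "date": conf["date"], "tg": conf["tg"]})
print(conf["url"] + conf["path"] + "?" + query)
# -> https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=pnt&page=1&date=20190202&tg=18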
------------------------------------------
elasticsearch (elasticSearchNode.py)
from elasticsearch import Elasticsearch


class Ela:
    es = Elasticsearch(hosts="192.168.240.129", port=9200)

    @classmethod
    def createIndex(cls):
        # ===============
        # Create the index
        # ===============
        cls.es.indices.create(
            index="today19020402",
            body={
                "settings": {
                    "number_of_shards": 5
                }
            }
        )

    @classmethod
    def indexExists(cls, paramindex):
        ####
        # Does the given index exist?
        ####
        exists = cls.es.indices.exists(index=paramindex)
        print(exists)

    @classmethod
    def indexCount(cls):
        print(cls.es.count(index="today19020402"))

    @classmethod
    def dataInsert(cls, element):
        # ===============
        # Insert documents
        # ===============
        for n, i in enumerate(element):
            res = cls.es.index(index="today19020402", doc_type="today", id=n + 1, body=i)
            print(res)


Ela.indexExists("today19020402")
Ela.indexCount()
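
Once dataInsert() has run, a quick way to see what actually landed in the index is a match_all search; a minimal sketch against the same host and index as above (the exact response shape depends on the Elasticsearch version):

from elasticsearch import Elasticsearch

# Minimal check of the inserted documents (same host/index as above)
es = Elasticsearch(hosts="192.168.240.129", port=9200)
res = es.search(index="today19020402", body={"query": {"match_all": {}}, "size": 5})
for hit in res["hits"]["hits"]:
    print(hit["_id"], hit["_source"].get("movie_name1"))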