python + crawling + elasticsearch
2019-02-04 deliverable
---------------------------
from json import loads
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
import re

from semi_proj.elasticSearchNode import Ela


class NMV:
    def __init__(self):
        self.__url = None
        self.__path = None
        self.__param = {"sel": None, "page": None, "date": None, "tg": None}
        self.__bsobj = None
        self.element = []

    def paramSetting(self):
        # Load the base URL, path, and fixed query parameters from info.json
        with open("./info.json", "r", encoding="utf-8") as f:
            json_doc = loads(f.read())
            print(json_doc)
            self.__url = json_doc["url"]
            self.__path = json_doc["path"]
            self.__param["sel"] = json_doc["sel"]
            self.__param["date"] = json_doc["date"]
            self.__param["tg"] = json_doc["tg"]

    def indexCreate(self):
        Ela.createIndex()

    def urlRequests(self):
        # Ranking pages 1-3
        for p in range(1, 4):
            self.__param["page"] = p
            param = urlencode(self.__param)
            # https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=pnt&date=20190202&tg=18&page=1
            requ = self.__url + self.__path + "?" + param
            html = requests.get(requ)
            if html.status_code == 200:
                self.__bsobj = BeautifulSoup(html.text, "html.parser")
                # Links to each movie's detail page
                a_list = list(map(lambda x: x.attrs["href"], self.__bsobj.select("div.tit5 > a")))
                for link in a_list:
                    # ex: => https://movie.naver.com/movie/bi/mi/basic.nhn?code=10200
                    sub_url = self.__url + link
                    html = requests.get(sub_url)
                    if html.status_code == 200:
                        self.__bsobj = BeautifulSoup(html.text, "html.parser")
                        info = self.__bsobj.select_one("div.mv_info")
                        try:
                            movie_name1 = info.select_one("h3.h_movie > a").string
                        except AttributeError as e:
                            print(e)
                        else:
                            movie_name2 = re.sub("[\n\r\t]", "", str(info.select_one("strong.h_movie2").string))
                            # Overview block
                            # #content > div.article > div.mv_info_area > div.mv_info > dl
                            tmp_summary = self.__bsobj.select_one(
                                "#content > div.article > div.mv_info_area > div.mv_info > dl > dd > p")
                            tmp_dir = {
                                "movie_name1": movie_name1,
                                "movie_name2": movie_name2,
                                "jangr": None,
                                "nation": None,
                                "minute": None,
                            }
                            for i in range(1, 5):
                                # ========================
                                jangr = {}   # genre
                                nation = {}  # production country
                                minute = {}  # running time
                                day = {}     # release date
                                # ========================
                                tmp = tmp_summary.select_one("span:nth-of-type({})".format(i))
                                if i != 3:
                                    tmp = tmp.select("a")
                                    tmp_list = list(map(lambda x: re.sub("[\n\r\t ]", "", str(x.string)), tmp))
                                    if i == 1:
                                        jangr["jangr"] = tmp_list
                                        tmp_dir["jangr"] = jangr["jangr"]
                                    elif i == 2:
                                        nation["nation"] = tmp_list
                                        tmp_dir["nation"] = nation["nation"]
                                    elif i == 4:
                                        # Release dates arrive as (year, month-day) pairs; join each pair
                                        ttmp_list = []
                                        for j in range(0, len(tmp_list), 2):
                                            ttmp_list.append(tmp_list[j] + tmp_list[j + 1])
                                        day["day"] = ttmp_list
                                        tmp_dir["day"] = day["day"]
                                else:  # i == 3: running time
                                    tmp_list = list(map(lambda x: re.sub("[\n\r\t ]", "", str(x.string)), tmp))
                                    minute["minute"] = tmp_list
                                    tmp_dir["minute"] = minute["minute"]
                            print(tmp_dir)
                            self.element.append(tmp_dir)
        Ela.dataInsert(self.element)


class T(NMV):
    pass


t = T()
# t.indexCreate()
t.paramSetting()
t.urlRequests()
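
For reference, each entry appended to self.element is a plain dict built from the detail page, so a single document handed to Ela.dataInsert() looks roughly like the sketch below. The field names come from the code above; the values are made-up placeholders, not actual crawl output.

# Hypothetical example of one crawled document in self.element
# (placeholder values; real values depend on the page crawled)
sample_doc = {
    "movie_name1": "한글 제목",                 # title from h3.h_movie > a
    "movie_name2": "English Title, 2018",      # title from strong.h_movie2
    "jangr": ["드라마"],                        # genre(s)
    "nation": ["미국"],                         # production country
    "minute": ["120분"],                        # running time
    "day": ["2018.12.19개봉"],                  # release date(s)
}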
-------------------------------
json file (info.json)
{
    "url": "https://movie.naver.com",
    "path": "/movie/sdb/rank/rmovie.nhn",
    "sel": "pnt",
    "date": "20190202",
    "tg": "18"
}
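
For clarity, these fields map directly onto the request URL that urlRequests() builds with urlencode; a minimal sketch (with page fixed to 1) looks like this:

from urllib.parse import urlencode

# Minimal sketch of how info.json turns into the ranking-page URL
conf = {"url": "https://movie.naver.com", "path": "/movie/sdb/rank/rmovie.nhn",
        "sel": "pnt", "date": "20190202", "tg": "18"}
query = urlencode({"sel": conf["sel"], "page": 1, "date": conf["date"], "tg": conf["tg"]})
print(conf["url"] + conf["path"] + "?" + query)
# -> https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=pnt&page=1&date=20190202&tg=18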
------------------------------------------
elasticsearch (elasticSearchNode.py)
from elasticsearch import Elasticsearch


class Ela:
    es = Elasticsearch(hosts="192.168.240.129", port=9200)

    @classmethod
    def createIndex(cls):
        # ===============
        # Create the index
        # ===============
        cls.es.indices.create(
            index="today19020402",
            body={
                "settings": {
                    "number_of_shards": 5
                }
            }
        )

    @classmethod
    def indexExists(cls, paramindex):
        ####
        # Does the given index exist?
        ####
        exists = cls.es.indices.exists(index=paramindex)
        print(exists)

    @classmethod
    def indexCount(cls):
        print(cls.es.count(index="today19020402"))

    @classmethod
    def dataInsert(cls, element):
        # ===============
        # Insert documents
        # ===============
        for n, i in enumerate(element):
            res = cls.es.index(index="today19020402", doc_type="today", id=n + 1, body=i)
            print(res)


Ela.indexExists("today19020402")
Ela.indexCount()
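
Once dataInsert() has run, a quick way to see what actually landed in the index is a match_all search; a minimal sketch against the same host and index as above (the exact response shape depends on the Elasticsearch version):

from elasticsearch import Elasticsearch

# Minimal check of the inserted documents (same host/index as above)
es = Elasticsearch(hosts="192.168.240.129", port=9200)
res = es.search(index="today19020402", body={"query": {"match_all": {}}, "size": 5})
for hit in res["hits"]["hits"]:
    print(hit["_id"], hit["_source"].get("movie_name1"))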