python + elasticsearch
'''
문화재 검색 목록/ 문화재 상세 검색
@ 작성자 : 김준현
@ 작성일 :
@ url : https://www.cha.go.kr/html/HtmlPage.do?pg=/publicinfo/pbinfo3_0202.jsp&mn=NS_04_04_02
'''
from urllib.parse import urlencode
import requests
import os
import time
import sys
from COMM.UrlInit import UrlInit
from COMM.DocParcing import DocParcing
class ClastList:
    """Crawl the cultural-heritage search-list and search-detail endpoints.

    The endpoint URLs come from ``UrlInit.urlReturn`` and responses are parsed
    with ``DocParcing.respoToXML``. Parsed rows are accumulated on the
    instance:

    * ``clastListElements``   -- rows from the search-list endpoint
    * ``clastDetailElements`` -- rows from the search-detail endpoint
    """

    # Child tags read from every <item> of the list response, in output order.
    _LIST_FIELDS = (
        "ccmaName",
        "ccbaMnm1",
        "ccbaMnm2",
        "ccbaCtcdNm",
        "ccsiName",
        "ccbaAdmin",
        "ccbaKdcd",  # part of the detail lookup key
        "ccbaCtcd",  # part of the detail lookup key
        "ccbaAsno",  # part of the detail lookup key
        "ccbaCncl",
        "ccbaCpno",
        "longitude",
        "latitude",
    )

    # Values copied from a list row to build the detail request query.
    _DETAIL_KEY_FIELDS = ("ccbaKdcd", "ccbaCtcd", "ccbaAsno")

    # Child tags read from every <item> of the detail response, in output order.
    _DETAIL_FIELDS = (
        "ccmaName",
        "ccbaMnm1",
        "ccbaMnm2",
        "gcodeName",
        "bcodeName",
        "mcodeName",
        "scodeName",
        "ccbaQuan",
        "ccbaAsdt",
        "ccbaCtcdNm",
        "ccsiName",
        "ccceName",
        "ccbaPoss",
        "ccbaAdmin",
        "ccbaCncl",
        "ccbaCndt",
        "imageUrl",
        "content",
    )

    def __init__(self):
        self.reqUrlList = UrlInit.urlReturn(cat1="CH", cat2="ClastList", cat3="url")
        self.reqUrlDetail = UrlInit.urlReturn(cat1="CH", cat2="ClastDetail", cat3="url")
        # Paging state; pageIndex is incremented before every list request.
        self.params = {"pageUnit": 100, "pageIndex": 0}
        self.clastListElements = []    # rows from the search-list endpoint
        self.clastDetailElements = []  # rows from the search-detail endpoint

    @staticmethod
    def _extractFields(item, fieldNames):
        """Return ``{tag: text}`` for each tag in *fieldNames* found under *item*.

        A tag that is absent from *item* maps to ``None`` (the same value an
        empty tag yields) instead of raising ``AttributeError``.
        """
        record = {}
        for tag in fieldNames:
            node = item.find(tag)
            # Compare against None explicitly: a childless XML element can be falsy.
            record[tag] = node.text if node is not None else None
        return record

    def doRequestsList(self):
        """Fetch every page of the search list and dump the rows to JSON.

        Pages are requested one second apart until the number of requested
        rows reaches the ``totalCnt`` reported by the response, or until a
        non-2xx status is returned. Rows containing any empty or missing field
        are dropped. The accumulated rows are then written to
        ``./JSON/ClastList.json`` through ``DocParcing.doJsonFile``.

        ``pageIndex`` lives on the instance, so a second call continues after
        the last page already requested.

        Raises:
            SystemExit: with status 1 when the HTTP connection fails.
        """
        fetchCount = 0
        while True:
            fetchCount += self.params["pageUnit"]
            self.params["pageIndex"] += 1
            time.sleep(1)  # throttle between page requests
            try:
                response = requests.get(self.reqUrlList + UrlInit.urlEncoding(self.params))
            except requests.ConnectionError as e:
                # str() is required: concatenating the exception object itself
                # raises TypeError and hides the real error.
                print("error message : " + str(e))
                sys.exit(1)

            if not 200 <= response.status_code < 300:
                # 3xx / 4xx / 5xx response: stop paging, keep what was collected.
                break

            xmlTree = DocParcing.respoToXML(response.text)
            totalCnt = int(xmlTree.find("totalCnt").text, base=10)
            print(totalCnt, fetchCount)

            for item in xmlTree.findall("item"):
                element = self._extractFields(item, self._LIST_FIELDS)
                element["cllcttime"] = time.strftime("%Y%m%d")  # collection date
                if None not in element.values():
                    self.clastListElements.append(element)
                    print(element)

            # ">=" rather than ">": when totalCnt is an exact multiple of
            # pageUnit the last page has already been requested at this point.
            if fetchCount >= totalCnt:
                break

        # The detail crawl and its JSON dump were disabled in the original
        # script; call doRequestsDetail() explicitly if detail rows are needed.
        DocParcing.doJsonFile(html_doc=self.clastListElements,
                              json_file_path="./JSON/ClastList.json")

    def doRequestsDetail(self):
        """Fetch the detail record for every row in ``clastListElements``.

        One request is sent per list row, half a second apart, keyed by the
        row's ``ccbaKdcd`` / ``ccbaCtcd`` / ``ccbaAsno`` values. Every ``item``
        of a 2xx response is appended to ``clastDetailElements``; rows with
        empty fields are kept. Non-2xx responses are skipped.

        Raises:
            SystemExit: with status 1 when the HTTP connection fails.
        """
        for listed in self.clastListElements:
            query = {key: listed[key] for key in self._DETAIL_KEY_FIELDS}
            time.sleep(0.5)  # throttle between detail requests
            try:
                response = requests.get(self.reqUrlDetail + UrlInit.urlEncoding(query))
            except requests.ConnectionError as e:
                print("error message : " + str(e))
                sys.exit(1)

            if not 200 <= response.status_code < 300:
                continue

            xmlTree = DocParcing.respoToXML(response.text)
            for item in xmlTree.findall("item"):
                element = self._extractFields(item, self._DETAIL_FIELDS)
                print(element)
                self.clastDetailElements.append(element)
def main():
    """Script entry point: create a ClastList crawler and run the list crawl."""
    ClastList().doRequestsList()


if __name__ == "__main__":
    main()
======================================================================================================
from elasticsearch import Elasticsearch, helpers
import json
from COMM import Elastic
class ClastListElastic:
    """Create the cultural-heritage indices and load the crawled list JSON.

    The Elasticsearch client is obtained from ``Elastic.Elastic.srvReturn()``.

    NOTE(review): the typed mapping (``"doc"``) and the ``doc_type`` argument
    are the pre-7.x API shape -- confirm the cluster/client version in use.
    """

    _LIST_INDEX = "tb_frip_clastlist"
    # Elasticsearch index names must be lowercase; the original
    # "tb_frip_clastDetail" is rejected by the server.
    _DETAIL_INDEX = "tb_frip_clastdetail"
    _DOC_TYPE = "doc"

    # Fields of the search-list documents (all mapped as text).
    _LIST_FIELDS = (
        "ccmaName",
        "ccbaMnm1",
        "ccbaMnm2",
        "ccbaCtcdNm",
        "ccsiName",
        "ccbaAdmin",
        "ccbaKdcd",
        "ccbaCtcd",
        "ccbaAsno",
        "ccbaCncl",
        "ccbaCpno",
        "longitude",
        "latitude",
        "cllcttime",
    )

    # Fields of the search-detail documents (all mapped as text).
    _DETAIL_FIELDS = (
        "ccmaName",
        "ccbaMnm1",
        "ccbaMnm2",
        "gcodeName",
        "bcodeName",
        "mcodeName",
        "scodeName",
        "ccbaQuan",
        "ccbaAsdt",
        "ccbaCtcdNm",
        "ccsiName",
        "ccceName",
        "ccbaPoss",
        "ccbaAdmin",
        "ccbaCncl",
        "ccbaCndt",
        "imageUrl",
        "content",
    )

    def __init__(self):
        self.es = Elastic.Elastic.srvReturn()

    @classmethod
    def _indexBody(cls, fieldNames):
        """Build a create-index body mapping every name in *fieldNames* to text.

        The Logstash-style ``"filter": {"mutate": ...}`` section of the
        original body is left out: it is pipeline configuration, not a
        create-index key.
        """
        return {
            "settings": {
                "number_of_shards": 5,
            },
            "mappings": {
                cls._DOC_TYPE: {
                    "properties": {name: {"type": "text"} for name in fieldNames},
                },
            },
        }

    def indexCreateList(self):
        """Create the index that stores the cultural-heritage search list."""
        self.es.indices.create(
            index=self._LIST_INDEX,
            body=self._indexBody(self._LIST_FIELDS),
        )

    def indexCreateDetail(self):
        """Create the index that stores the cultural-heritage detail records."""
        self.es.indices.create(
            index=self._DETAIL_INDEX,
            body=self._indexBody(self._DETAIL_FIELDS),
        )

    def jsonFileInsertList(self):
        """Index every record of ``./JSON/ClastList.json`` into the list index.

        Document ids are the 1-based position of each record in the file.
        """
        # The with-block closes the file; no explicit close() is needed.
        with open("./JSON/ClastList.json", "r", encoding="utf-8") as jsonfile:
            records = json.load(jsonfile)

        for docId, record in enumerate(records, start=1):
            print(record)
            self.es.index(index=self._LIST_INDEX, doc_type=self._DOC_TYPE,
                          id=docId, body=record)
'ELK > elasticsearch' 카테고리의 다른 글
python + elasticsearch (0) | 2019.02.21 |
---|---|
python + elasticsearch + 현재 정리 중 (0) | 2019.02.19 |
python + elasticsearch + 조회/삽입/생성 (0) | 2019.02.03 |
search java api (0) | 2019.01.29 |
python + elasticsearch api (0) | 2019.01.29 |