from csv to json convert + logstash
import csv
import json
import os
import shutil
import subprocess
import time
#
class ConvJson:
    """Convert pipe-delimited CSV files to NDJSON and feed them to Logstash.

    Workflow: csvRead() turns every *.csv in the target directory into a
    sibling *.json file (one JSON object per line), jsonFileMov() moves those
    JSON files into a sub-directory, and logstashInsert() pipes each JSON
    file into a Logstash process via stdin.
    """

    def __init__(self):
        # Directory containing the source CSV files (must be filled in).
        self._targetPath = r""
        # Sub-directory (under _targetPath) the JSON files are moved into.
        self._jsonPath = "jsonDir"
        # Field names assigned to the pipe-delimited CSV columns.
        self._fieldName = ("a", "b", "c")
        # Logstash executable path and config path (must be filled in).
        self._logstashRun = ""
        self._targetConfDir = ""

    def csvRead(self):
        """Convert each *.csv in the target directory to an NDJSON file.

        The first CSV line is treated as a header and skipped; remaining rows
        become one JSON object per line, with no newline after the last record
        (matches the original output format).
        """
        os.chdir(self._targetPath)
        for f in os.listdir():
            fileAbsPath = os.path.abspath(f)
            fileName, fileExtension = os.path.splitext(fileAbsPath)
            if fileExtension != ".csv":
                continue
            try:
                # Context managers guarantee both handles are closed on every
                # path (the original leaked them on error).
                with open(fileAbsPath, "r", encoding="utf-8") as csvFile, \
                        open("{}.json".format(fileName), "w", encoding="utf-8") as jsonFile:
                    next(csvFile)  # skip the header row
                    reader = csv.DictReader(f=csvFile, fieldnames=self._fieldName, delimiter="|")
                    data = list(reader)
                    for x, row in enumerate(data):
                        strData = json.dumps(row)
                        if x != len(data) - 1:
                            strData += "\n"
                        jsonFile.write(strData)
            except (OSError, StopIteration) as E:
                # OSError covers both opens (narrower than the original bare
                # except); StopIteration means the CSV had no header line.
                print(E)
                exit(1)

    def jsonFileMov(self):
        """Move every *.json in the target directory into the JSON sub-directory."""
        os.chdir(self._targetPath)
        # os.path.join instead of manual "\\" concatenation.
        destDir = os.path.join(self._targetPath, self._jsonPath)
        for f in os.listdir():
            fileAbsPath = os.path.abspath(f)
            fileName, fileExtension = os.path.splitext(fileAbsPath)
            if fileExtension == ".json":
                try:
                    shutil.move("{}.json".format(fileName), destDir)
                except OSError as E:
                    print("파일 이동 에러")
                    print(E)
                    exit(1)
                else:
                    print("이동 성공")

    def logstashInsert(self):
        """Pipe each JSON file into Logstash via stdin and print its output.

        NOTE(review): the command line is built by string concatenation and run
        with shell=True — paths containing spaces or shell metacharacters will
        break or be interpreted by the shell.
        """
        os.chdir(os.path.join(self._targetPath, self._jsonPath))
        for f in os.listdir():
            fileAbsPath = os.path.abspath(f)
            command = self._logstashRun + " -f " + self._targetConfDir + " < " + fileAbsPath
            p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
            stdout, _ = p.communicate()
            print(stdout)
            print("indexing complete")  # fixed typo: was "complate"
            print("=======================================================")
            # Give Logstash time to finish indexing before the next file.
            time.sleep(5)
def main():
    """Entry point: convert the CSV files, then relocate the JSON output."""
    converter = ConvJson()
    converter.csvRead()
    converter.jsonFileMov()


if __name__ == "__main__":
    main()
'언어 > python' 카테고리의 다른 글
python으로 pdf 파일 read (0) | 2019.12.08 |
---|---|
백준 2108 (0) | 2019.12.08 |
네이버 기사 크롤링 => elasticsearch 적재 (0) | 2019.07.12 |
naver music 크롤링 + elasticsearch (0) | 2019.05.22 |
네이버 뉴스 크롤링 + 형태소 (0) | 2019.05.01 |
네이버 기사 크롤링 => elasticsearch 적재
##
# date
##
from elasticsearch import Elasticsearch
import requests as req
from selenium.webdriver import chrome
from urllib.parse import urlencode
import json
import yaml
from bs4 import BeautifulSoup
from time import localtime
import time
import os
import sys
# ========================================
class NaverNews:
    """Crawl saved Naver news list pages and index keyword-matching headlines
    into Elasticsearch."""

    def __init__(self):
        # Elasticsearch client; elaInformation() exits the process when the
        # server is unreachable.
        self.elastiClient = NaverNews.elaInformation()
        urlSettingObj = NaverNews.urlRequestSetting()
        ## base request URL from the YAML config
        self.reqUrl = urlSettingObj.get("url")
        ## e.g. format => 20190712
        self.currTimeObj = NaverNews.getCurrTime()
        # Extra query parameters; "page" is filled in per request.
        self.urlInfo = {
            "etcParams": urlSettingObj.get("etcParam"),
            "page": None
        }
    """ Does the headline title contain one of the target words?
    """
    def isTrue(self):
        ## ====================================================================
        # Move into the directory holding the previously downloaded HTML files.
        ## ====================================================================
        os.chdir(r"C:\Users\ezfarm\PycharmProjects\ElasticSearchProj\htmlObj")
        htmlPath = r"C:\Users\ezfarm\PycharmProjects\ElasticSearchProj\htmlObj"  # NOTE(review): assigned but never used
        for htmlF in os.listdir():
            abstractPath = os.path.abspath(htmlF)
            print (abstractPath)
            ### =============================
            # Open one saved HTML file.
            ### =============================
            try:
                htmlFileRead = open(abstractPath)
            except FileNotFoundError as e:
                print (e)
                pass
            else:
                ### =============================
                # Parse the headline <li> items.
                # NOTE(review): htmlFileRead is never closed.
                ### =============================
                bsObject = BeautifulSoup(htmlFileRead,"html.parser")
                HEAD_LINE = bsObject.select("ul.type06_headline > li")
                for h in HEAD_LINE:
                    try:
                        # Prefer the second <dt> when the item has two
                        # (presumably the text headline — verify against markup).
                        headline = h.select("dl > dt")[1]
                    except IndexError as e:
                        # Fewer than two <dt> elements: fall back to the first.
                        try:
                            headline = h.select_one("dl > dt")
                        except:
                            print ("요청 error")
                            pass
                        else:
                            responseObj = self.textPreprocessing(headline.a.string)
                            if responseObj["isTrue"]:
                                self.elasticInsertDocuments(responseObj["title"],
                                    h.select_one("dl > dd > span.lede").string)
                    else:
                        # Same processing as the fallback branch above.
                        responseObj = self.textPreprocessing(headline.a.string)
                        if responseObj["isTrue"]:
                            self.elasticInsertDocuments(responseObj["title"],
                                h.select_one("dl > dd > span.lede").string)

    def textPreprocessing(self, txt):
        """Normalize headline text and flag whether it contains a keyword.

        Returns {"title": cleaned text, "isTrue": True/False}.
        """
        tmp = str(txt).strip().replace("\n", "")
        mark = {"title": tmp, "isTrue": False}
        # Keyword list the headline is matched against.
        for i in ["김정은", "이명박", "미사일"]:
            if i in tmp:
                mark["isTrue"] = True
                break
        return mark

    def elasticInsertDocuments(self, title, hObject):
        """Index one headline document; failures are printed and swallowed."""
        documents = {
            "title" : title,
            "context" : hObject,
            "cllctdt" : self.currTimeObj
        }
        try:
            self.elastiClient.index (
                index ="naver_headline_index", # target index
                doc_type ="doc",
                body =documents
            )
        except:
            # NOTE(review): the bare except hides the real indexing error.
            print ("적재 실패 !!!")
            pass
        else:
            # Throttle successive writes a little.
            time.sleep(1.2)
            print("elasticsearch insert success !!!")
            print (documents)

    def doRequests(self):
        """Download list pages 1-94 for the configured section and save each
        response body as html_file_<n>.html."""
        for n, p in enumerate(range(1, 95)):
            self.urlInfo["page"] = str(p)
            """
            mode=LSD&mid=sec&sid1=100&date=20190712&page=7
            """
            # Query string: configured params + today's date + page number.
            paramsEtc = self.urlInfo["etcParams"] + "&" + \
                        "date=" + self.currTimeObj + "&" + \
                        "page=" + self.urlInfo["page"]
            requestUrl = self.reqUrl + "?" + paramsEtc
            try:
                html = req.get(requestUrl)
            except req.exceptions.RequestException as e:
                print (e)
                sys.exit(1)
            else:
                # print ("{} page 작업 중 ...".format(n+1))
                # bsObject = BeautifulSoup(html.text, "html.parser")
                htmlName = "html_file_{}.html".format(str(n+1))
                htmlFile = open(r"C:\Users\ezfarm\PycharmProjects\ElasticSearchProj\htmlObj\{}".format(htmlName),
                                "w")
                try:
                    htmlFile.write(html.text)
                except:
                    print ("html file write error")
                    pass
                else:
                    print ("{} 번째 데이터 파일 write success !!!".format(n+1))
                htmlFile.close()
    """ Request settings
    """
    @classmethod
    def urlRequestSetting(cls):
        """Load and return the URL configuration dict from url.yml; exits
        when the file is missing."""
        try:
            f = open(r"C:\Users\ezfarm\PycharmProjects\ElasticSearchProj\conf\url.yml", "r", encoding="utf-8")
        except FileNotFoundError as e:
            print(e)
            sys.exit(1)
        else:
            yDoc = yaml.load(f, Loader=yaml.Loader)
            f.close()  # release the handle
            return yDoc
    """ Search-date setting
    """
    @classmethod
    def getCurrTime(cls):
        """Return today's date as a YYYYMMDD string."""
        currObjTime = time.strftime("%Y%m%d", localtime())
        return currObjTime
    """ Check whether the elasticsearch server is alive
    """
    @classmethod
    def isAliveElastic(cls, elaAddress):
        """Ping http://<elaAddress>:9200; exit on failure, otherwise return
        an Elasticsearch client bound to that host."""
        try:
            req.get("http://" + elaAddress + ":9200")
        except req.exceptions.RequestException as e:
            """ server is dead !!
            """
            print(e)
            sys.exit(1)
        else:
            print("elasticsearch server is alive !!!")
            return Elasticsearch(host=elaAddress)
    """ Return elasticsearch server address information
    """
    @classmethod
    def elaInformation(cls):
        """Read the server address from elainfo.json and return a live client."""
        path = r"C:\Users\ezfarm\PycharmProjects\ElasticSearchProj\conf\elainfo.json"
        try:
            f = open(path, "r", encoding="utf-8")
        except:
            # NOTE(review): bare except + silent exit makes failures hard to diagnose.
            sys.exit(1)
        else:
            jsonDoc = json.load(f)
            f.close()
            elasticNode = NaverNews.isAliveElastic(jsonDoc.get("ela"))
            return elasticNode
def main():
    """Entry point: build the crawler and index the headline files on disk."""
    crawler = NaverNews()
    crawler.isTrue()
    # crawler.doRequests()  # enable to (re)download the HTML pages first


if __name__ == "__main__":
    main()
'언어 > python' 카테고리의 다른 글
백준 2108 (0) | 2019.12.08 |
---|---|
from csv to json convert + logstash (0) | 2019.11.26 |
naver music 크롤링 + elasticsearch (0) | 2019.05.22 |
네이버 뉴스 크롤링 + 형태소 (0) | 2019.05.01 |
페이스북 - python (0) | 2019.04.24 |
naver music 크롤링 + elasticsearch
from time import localtime, strftime
from bs4 import BeautifulSoup
import requests
import json
from Ela.Elast import Elarv
class NMusic:
    """Scrape the Naver Music top-100 chart and index the leading entries
    into Elasticsearch via the Elarv helper."""

    def __init__(self):
        # Chart URL loaded from the local JSON config file.
        self.url = NMusic.getInformation()

    def getUrl(self, count=10):
        """Fetch the chart page and index its first *count* entries.

        Each entry is indexed as {"rank", "name", "artist", "insertdate"}.
        *count* defaults to 10, matching the original hard-coded range.
        """
        html = requests.get(self.url)
        if html.status_code == 200:
            bsObject = BeautifulSoup(html.text, "html.parser")
            print("title : {}".format(bsObject.title.string))
            top100 = bsObject.select_one("table.home_top100 > tbody")
            for r in range(1, count + 1):
                # Row selector is rank-specific: tr...list<rank>.
                lst = top100.select_one("tr._tracklist_move._track_dsc.list{rank}".format(rank=r))
                rnk = lst.select_one("td.ranking > span.num")                    # chart rank
                nme = lst.select_one("td.name > span.m_ell > a")                 # song title
                artist = lst.select_one("td._artist > span.m_ell > a._artist")   # artist name
                insrtDay = strftime("%Y%m%d", localtime())                       # collection date
                d = {"rank": rnk.string,
                     "name": nme.string,
                     "artist": artist.string,
                     "insertdate": insrtDay}
                Elarv.insertDocuments(d)
                print ("적재 성공 !!!")

    @classmethod
    def getInformation(cls):
        """Read the chart URL from the local config JSON.

        Raises FileNotFoundError when the config file is missing (the original
        printed the error and implicitly returned None, deferring the failure
        to requests.get(None) later on).
        """
        try:
            # Context manager ensures the handle is closed on every path.
            with open(r"C:\Users\junhyeon.kim\Desktop\StuEla\clw\info.json", "r", encoding="utf-8") as f:
                jsonDoc = dict(json.load(f)).get("url")
        except FileNotFoundError as e:
            print (e)
            raise
        return jsonDoc
def main():
    """Entry point: create the scraper and run the chart crawl."""
    scraper = NMusic()
    scraper.getUrl()


if __name__ == "__main__":
    main()
from elasticsearch import Elasticsearch
class Elarv:
    """Thin helper that writes documents into the `nmusic` Elasticsearch index."""

    @classmethod
    def insertDocuments(cls, elements):
        """Index one document dict into `nmusic` (doc type `doc`)."""
        client = Elasticsearch(hosts="192.168.240.10")
        client.index(index="nmusic", doc_type="doc", body=elements)
def main():
    """Entry point: instantiate the Elasticsearch helper (no further action)."""
    node = Elarv()


if __name__ == "__main__":
    main()
'언어 > python' 카테고리의 다른 글
from csv to json convert + logstash (0) | 2019.11.26 |
---|---|
네이버 기사 크롤링 => elasticsearch 적재 (0) | 2019.07.12 |
네이버 뉴스 크롤링 + 형태소 (0) | 2019.05.01 |
페이스북 - python (0) | 2019.04.24 |
python + outlook (0) | 2019.03.31 |
https://kin.naver.com/qna/detail.nhn?d1id=1&dirId=1040101&docId=327353769&mode=answer
# include <stdio.h>
# include <stdlib.h>
# include <math.h>
# define ERROR_ 1
// A right triangle: base and height are read from stdin and the hypotenuse
// is computed into longLine by _longLinePrintf_.
typedef struct Triangle
{
    double bottomLine;  // base
    double heightLine;  // height
    double longLine;    // hypotenuse (computed)
}Tri;
// Allocate the Tri and zero its fields ______
void _init_(Tri** tparam);
// Release the allocated Tri _________________
void _memoryFree_(Tri** tparam);
// Read base and height from stdin ___________
void _numberInput_(Tri** tparam);
// Compute and print the hypotenuse __________
void _longLinePrintf_(Tri** tparam);
int main(void)
{
    Tri* tnode = NULL;
    _init_(&tnode);            /* allocate and zero the triangle   */
    _numberInput_(&tnode);     /* read base and height from stdin  */
    _longLinePrintf_(&tnode);  /* compute and print the hypotenuse */
    _memoryFree_(&tnode);      /* release the allocation           */
    return 0;
} // end of main function
// Allocate one Tri and zero every field; exits the process on allocation failure.
void _init_(Tri** tparam)
{
    Tri* node = (Tri*)malloc(sizeof(Tri));

    if (node == NULL)
    {
        printf("malloc error");
        exit(ERROR_);
    }

    node->bottomLine = 0.0;
    node->heightLine = 0.0;
    node->longLine = 0.0;
    *tparam = node;
} // end of _init_ function
// Release the Tri allocated by _init_ and clear the caller's pointer so it
// cannot be dereferenced after free — the double pointer makes this possible,
// but the original never nulled it, leaving a dangling pointer.
void _memoryFree_(Tri** tparam)
{
    free(*tparam);
    *tparam = NULL;
} // end of _memoryFree_ function
// Prompt for and read the base and height from stdin.
// NOTE(review): scanf_s is MSVC / C11 Annex K specific and its return values
// are unchecked — non-numeric input leaves the fields at their zeroed values.
void _numberInput_(Tri** tparam)
{
    printf("밑변? ");
    scanf_s("%lf", &(**tparam).bottomLine);
    printf("높이? ");
    scanf_s("%lf", &(**tparam).heightLine);
}
// Compute the hypotenuse from the stored base and height, save it into the
// struct, and print it.
void _longLinePrintf_(Tri** tparam)
{
    double base = (*tparam)->bottomLine;
    double height = (*tparam)->heightLine;

    (*tparam)->longLine = sqrt(base * base + height * height);
    printf("빗변의 길이: %lf\n", (*tparam)->longLine);
}
'언어 > c언어' 카테고리의 다른 글
c언어 linkedlist (0) | 2019.12.31 |
---|---|
c언어 네이버 풀이 중첩 for문을 사용해서 3을 입력하면 (0) | 2019.05.06 |
네이버 풀이 (0) | 2018.12.01 |
네이버 문제 풀이 - 최대공약수 (0) | 2018.11.28 |
네이버 지식이 풀이 (0) | 2018.11.28 |
c언어 네이버 풀이 중첩 for문을 사용해서 3을 입력하면
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# define LENGTH 200
// Digit buffer: numList holds the characters for n, n-1, ..., 1 (filled by
// writeNumber, so only meaningful for n in 1..9), n is the user input.
typedef struct Num {
    char numList[LENGTH]; // digit characters, highest first, NUL-terminated
    int n;                // user-supplied count
}Num, *ptrNum;
ptrNum retNode();           // allocate one Num node (NULL on failure)
void init(Num**);           // reset buffer and count
void writeNumber(Num**);    // read n and fill the digit buffer
void testPrintf(Num**);     // debug: dump the whole buffer
void resultPrintf(Num**);   // print the digit triangle
int main(void) {
    ptrNum pnode = retNode();        /* allocate the Num node       */
    if (pnode == NULL) { exit(1); }  /* abort on allocation failure */
    // initialize fields
    init(&pnode);
    // read n and build the digit buffer
    writeNumber(&pnode);
    // debug print
    //testPrintf(&pnode);
    // print the result triangle
    resultPrintf(&pnode);
    free(pnode);
    return 0;
}
// Allocate one Num node; the caller must check for NULL.
ptrNum retNode() {
    return (ptrNum)malloc(sizeof(Num));
} // end of retNode function
// Reset the node: empty string in the buffer, count set to zero.
void init(Num** param) {
    (*param)->numList[0] = '\0';
    (*param)->n = 0;
} // end of init function
// Read n from stdin, then store the digit characters n, n-1, ..., 1 in
// numList as a NUL-terminated string.
// NOTE(review): i + '0' yields a digit character only for 1 <= n <= 9;
// larger inputs write non-digit characters. scanf_s is MSVC/Annex-K only
// and its return value is unchecked.
void writeNumber(Num** param) {
    int num = 0;  // NOTE(review): never used
    int j = 0;
    int i;
    scanf_s("%d", &(*param)->n);
    // Walk downward from n, appending each digit character.
    for (i = (*param)->n; i >= 1; i--, j++) {
        *((*param)->numList + j) = i + '0';
    }
    *((*param)->numList + j) = '\0';  // terminate the string
} // end of writeNumber function
// Debug helper: print the whole digit buffer followed by a newline.
void testPrintf(Num** param) {
    puts((*param)->numList);
} // end of testPrintf function
// Print the digit triangle: row r (0-based) shows the first r+1 characters
// of the buffer.
void resultPrintf(Num** param) {
    int row, col;
    for (row = 0; row < (*param)->n; row++) {
        for (col = 0; col <= row; col++) {
            printf("%c", (*param)->numList[col]);
        }
        printf("\n");
    }
} // end of resultPrintf function
'언어 > c언어' 카테고리의 다른 글
c언어 linkedlist (0) | 2019.12.31 |
---|---|
네이버 풀이 (0) | 2019.05.18 |
네이버 풀이 (0) | 2018.12.01 |
네이버 문제 풀이 - 최대공약수 (0) | 2018.11.28 |
네이버 지식이 풀이 (0) | 2018.11.28 |
네이버 뉴스 크롤링 + 형태소
from selenium import webdriver
from bs4 import BeautifulSoup
from konlpy.tag import Okt
import requests
import time
from openpyxl import Workbook
from Nature.Utils.Util import Util
##
# 2019-05-01
class News:
    """Crawl the Naver news section pages with headless Chrome, tokenize each
    article body with Okt, collect particles/verbs/adjectives/modifiers/foreign
    words, and write the unique words into an Excel sheet."""

    def __init__(self):
        self.workBook = Workbook()  # Excel workbook for the word dump
        self.url = "https://news.naver.com/"
        self.chromeDriver = None
        # Unique words accumulated across every crawled article.
        self.josaList = set()
        #self.params = Util.getConfiguration()
    def getUrl(self):
        """Drive the crawl: visit each section tab, fetch every clustered
        article, tokenize its body, then export the word set and quit."""
        # Headless Chrome options -----------------------------------------
        option = webdriver.ChromeOptions()
        option.add_argument("headless")
        option.add_argument("window-size=1920x1080")
        option.add_argument("disable-gpu")
        # -----------------------------------------------------------------
        self.chromeDriver = webdriver.Chrome(executable_path="C:\\Users\\junhyeon.kim\\Documents\\chrome_driver\\chromedriver.exe",
                                             options =option)
        self.chromeDriver.get(self.url)
        self.chromeDriver.implicitly_wait(3)
        # Sanity check: print the page title ------------------------------
        print (self.chromeDriver.title) ; time.sleep(2)
        # Section tabs: 3 politics / 4 economy / 5 society / 6 life-culture /
        # 7 world / 8 IT-science
        for p in range(3, 9):
            # Click the p-th section tab.
            self.chromeDriver.find_element_by_xpath('//*[@id="lnb"]/ul/li['+ str(p) +']/a/span[1]').click()
            print (" >>> {}".format(self.chromeDriver.title)) ; time.sleep(2)
            bsObject = BeautifulSoup(self.chromeDriver.page_source, "html.parser")
            cluster = bsObject.select("div.cluster > "
                                      "div.cluster_group > "
                                      "div.cluster_body > "
                                      "ul.cluster_list > "
                                      "li.cluster_item > "
                                      "div.cluster_text")
            for c in cluster:
                t = c.select_one("a")
                if t.string != None:  # NOTE(review): `is not None` is the idiomatic test
                    print ("title : {0} , requ : {1}".format(t.string, t.attrs))
                    html = requests.get(t.attrs["href"])
                    if html.status_code == 200:
                        bsObject = BeautifulSoup(html.text, "html.parser")
                        txt = bsObject.select_one("div#articleBodyContents")
                        # Clean-up: remove newlines, drop the flash-workaround
                        # boilerplate, trim surrounding whitespace.
                        resltText = str(txt.text).replace("\n", "")
                        resltText = resltText.replace("// flash 오류를 우회하기 위한 함수 추가function _flash_removeCallback() {}", "")
                        resltText = resltText.strip()
                        print (resltText)
                        self.detail(resltText)
                        print ("===========================")
        print (self.josaList)
        self.writeXl(self.josaList)
        self.destroy()
    def detail(self, text):
        """POS-tag *text* with Okt and merge the filtered words into josaList."""
        okt = Okt()
        p = [x for x in okt.pos(text)]
        s = self.removeWord(p)
        self.josaList = self.josaList.union(s)
    def removeWord(self, d):
        """Filter POS-tagged pairs.

        :param d: list of (word, tag) tuples from Okt.pos
        :return: set of words tagged Foreign/Josa/Verb/Adjective/Modifier
        """
        r = set()
        for i in d:
            if i[1] == "Foreign" or \
               i[1] == "Josa" or \
               i[1] == "Verb" or \
               i[1] == "Adjective" or \
               i[1] == "Modifier":
                r.add(i[0])
        return r
    # Write the collected words into the Excel sheet, one word per row.
    def writeXl(self, wrdData):
        workSheet = self.workBook.active
        for n, w in enumerate(wrdData):
            workSheet.cell(row=n+1, column=1).value = w
        self.workBook.save(r"C:\Users\junhyeon.kim\Desktop\ezfarm\Nature\Result\stopWord.xlsx")
        self.workBook.close()
    def destroy(self):
        """Close the Chrome driver if one was started."""
        if self.chromeDriver != None:
            self.chromeDriver.close()
def main():
    """Entry point: crawl the news sections and export the word list."""
    crawler = News()
    crawler.getUrl()


if __name__ == "__main__":
    main()
'언어 > python' 카테고리의 다른 글
네이버 기사 크롤링 => elasticsearch 적재 (0) | 2019.07.12 |
---|---|
naver music 크롤링 + elasticsearch (0) | 2019.05.22 |
페이스북 - python (0) | 2019.04.24 |
python + outlook (0) | 2019.03.31 |
selenium_ (0) | 2019.03.11 |
페이스북 - python
import os
from PIL import Image
from SIMSIM.Utils import Utils
class TestCode:
    """Group images in a directory by filename prefix and, per group, stack
    the resized images vertically into one combined JPEG."""

    def __init__(self):
        # Directory whose images will be grouped (resolved by the project Utils).
        self.targetDirectory = Utils.getFile()
        # prefix -> list of absolute image paths sharing that prefix
        self.elements = {}

    def directorySearch(self):
        """Populate self.elements, grouping files by the text before the
        first '-' in their name."""
        Utils.directoryMove(self.targetDirectory)
        directoryList = os.listdir(os.path.abspath(os.getcwd()))
        for f in directoryList:
            mrk = f.split("-")[0]
            # setdefault replaces the original explicit key-exists branch.
            self.elements.setdefault(mrk, []).append(os.path.abspath(f))
        print (self.elements)

    def imageSize(self):
        """For each prefix group, resize every image to the group's minimum
        size and paste them vertically into <prefix>_dir/<prefix>.jpg.

        NOTE(review): the canvas height is reSize[1]*2, which assumes exactly
        two images per group — a third image would be pasted outside the
        canvas. Confirm against the expected input.
        """
        for k, v in self.elements.items():
            new_dir = os.getcwd() + "\\" + k + "_dir"
            if not os.path.exists(new_dir):
                os.mkdir(new_dir)
            # "with" is a historical typo for "width"; kept because avgSize()
            # reads the same keys.
            sizeDict = {"with": [], "height": []}
            for x in v:
                im = Image.open(x)
                w, h = im.size
                sizeDict["with"].append(w)
                sizeDict["height"].append(h)  # bug fix: was appending the width
            # Target size: smallest width and height across the group.
            reSize = self.avgSize(sizeDict)
            # White canvas tall enough for two stacked images.
            saveImageFile = Image.new("RGB", (reSize[0], reSize[1]*2), (255, 255, 255))
            yOffset = 0
            for x in v:
                rIm = Image.open(x)
                resizedImage = rIm.resize(reSize)
                saveImageFile.paste(resizedImage, (0, yOffset))
                yOffset += resizedImage.size[1]
            os.chdir(new_dir)
            saveImageFile.save("{}.jpg".format(k))
            # Return to the parent directory for the next group.
            os.chdir("..")

    def avgSize(self, size):
        """Return (min width, min height) from a dict with "with"/"height" lists."""
        return min(size["with"]), min(size["height"])
def main():
    """Entry point: group the images, then build the stacked JPEGs."""
    runner = TestCode()  # create the worker object
    runner.directorySearch()
    runner.imageSize()


if __name__ == "__main__":
    main()
'언어 > python' 카테고리의 다른 글
naver music 크롤링 + elasticsearch (0) | 2019.05.22 |
---|---|
네이버 뉴스 크롤링 + 형태소 (0) | 2019.05.01 |
python + outlook (0) | 2019.03.31 |
selenium_ (0) | 2019.03.11 |
2019년 3월 9일 ( 주말 프로젝트 ) (0) | 2019.03.09 |
elasticsearch java api : total index search
/**
 * Print the name of every index in the cluster (wildcard "*").
 * Client I/O failures are printed and swallowed.
 */
public void totalIndex() {
    GetIndexRequest getIndexRequest = new GetIndexRequest().indices("*");
    try {
        GetIndexResponse response = _highClient.indices().get(getIndexRequest, RequestOptions.DEFAULT);
        for (String indexName : response.getIndices()) {
            System.out.println(indexName);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
'언어 > java' 카테고리의 다른 글
변수 [ Java의 정석 ] (0) | 2020.04.06 |
---|---|
elasticsearch + java + index 삭제 (0) | 2019.12.14 |
elasticsearch java api search (0) | 2019.03.13 |
크롤링 => json 파일로 적재 (0) | 2019.03.03 |
crawling + json (0) | 2019.02.13 |
python + outlook
import win32com.client as win32
import time

# Connect to the local Outlook MAPI namespace.
mapi = win32.Dispatch("Outlook.Application").GetNamespace("MAPI")

# Default folder ids: 5 = mail I sent, 6 = mail I received (inbox).
received = mapi.GetDefaultFolder("6")
messages = received.Items

# How many messages are in the folder?
print (messages.Count)

for msg in messages:
    # msg.Subject: mail title / SenderName: sender's name /
    # SenderEmailAddress: sender's e-mail address
    print ("To : {}|{} Text: {} ".format(msg.SenderName, msg.SenderEmailAddress, msg.Subject))
'언어 > python' 카테고리의 다른 글
네이버 뉴스 크롤링 + 형태소 (0) | 2019.05.01 |
---|---|
페이스북 - python (0) | 2019.04.24 |
selenium_ (0) | 2019.03.11 |
2019년 3월 9일 ( 주말 프로젝트 ) (0) | 2019.03.09 |
data crawling (0) | 2019.03.03 |
jinja + html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>{{ name }}</title>
    <style>
        /* Highlight the key column cells */
        th#t01 {
            background-color: #f1f1c1;
        }
        td {
            text-align: left;
        }
    </style>
</head>
<body>
<table style="width:100%" border="2">
    {# data_dict: list of dicts; each key/value pair becomes one table row #}
    {% for e in data_dict %}
    {% for key, value in e.items() %}
    <tr>
        <th id="t01"> {{ key }} </th>
        <td>
            {# movie_summary holds a nested mapping -> render an inner table.
               NOTE(review): dict() must be exposed in the Jinja environment
               (it is not a built-in Jinja global) — confirm in the renderer. #}
            {% if key == "movie_summary" %}
            {% for k, v in dict(value).items() %}
            <table>
                <tr>
                    <th> {{ k }} </th>
                    <td> {{ v }} </td>
                </tr>
            </table>
            {% endfor %}
            {% else %}
            {# poster paths render as images, everything else as plain text #}
            {% if key == "movie_poster_path" %}
            <img src= "{{ value }}">
            {% else %}
            {{ value }}
            {% endif %}
            {% endif %}
        </td>
    </tr>
    {% endfor %}
    {% endfor %}
</table>
</body>
</html>
'언어 > html' 카테고리의 다른 글
스터디 (html-> list) (0) | 2017.09.10 |
---|---|
<h1> </h1> (0) | 2016.05.24 |
테스트 (0) | 2016.05.10 |
작업중 (0) | 2016.02.28 |
study1 (0) | 2016.02.16 |