pdf conversion
from pdflib import Document
import os
import base64
from ela_dir.Ela import Ela


class PDFObj():
    def __init__(self):
        # Ela.__init__(self)
        self._targetPath = "./pdf_dir"

    def dirSearch(self):
        os.chdir(self._targetPath)
        cur = os.listdir()
        for f in cur:
            fname, fext = os.path.splitext(f)
            if fext == ".pdf":
                doc = Document(f)
                print(doc.metadata)
                for c, p in enumerate(doc):
                    strData = " ".join(p.lines).strip()
                    # encodedBytes = base64.b64encode(strData.encode("utf-8"))
                    # encodedStr = str(encodedBytes, "utf-8")
                    # e = {"page_": c+1,
                    #      "data_": encodedStr}
                    e = {"page_": c+1, "data_": strData}
                    print(e)


if __name__ == "__main__":
    o = PDFObj()
    o.dirSearch()
elasticsearch + java + index deletion
static public void indexRemoveQuery(RestHighLevelClient param, String targetIndex) {
    DeleteIndexRequest deleteIndexRequest = new DeleteIndexRequest(targetIndex);
    try {
        param.indices().delete(deleteIndexRequest, RequestOptions.DEFAULT);
        // only report success when the delete call did not throw
        System.out.println(targetIndex + " : index remove success");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
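The same deletion can also be done from Python; a minimal sketch with the elasticsearch-py client (the host and index name below are placeholders, not values from the post):

from elasticsearch import Elasticsearch

# placeholder host; point this at your own cluster
es = Elasticsearch(["http://localhost:9200"])
# delete the index; ignore 404 so a missing index is not treated as an error
es.indices.delete(index="target_index", ignore=[404])
print("target_index : index remove success")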
logstash + app-search
input {
  file {
    start_position => "beginning"
    path => ["/opt/logstash-7.2.0/conf/test.json"]
    sincedb_path => "/dev/null"
    codec => "json"
  }
}
filter {
  mutate {
    remove_field => ["path", "@version", "@timestamp", "host"]
  }
}
output {
  stdout {}
  elastic_app_search {
    api_key => "private-fobwwhsirra3cgner79khnhi"
    engine => "kimjunhyeon"
    url => "http://172.16.40.139:3002"
    path => "/api/as/v1/"
    id => "test_01"
  }
}
reading a pdf file with python
from pdflib import Document
import os
import base64


class PDFObj():
    def __init__(self):
        self._targetPath = "./pdf_dir"

    def dirSearch(self):
        os.chdir(self._targetPath)
        cur = os.listdir()
        for f in cur:
            fname, fext = os.path.splitext(f)
            if fext == ".pdf":
                doc = Document(f)
                print(doc.metadata)
                for c, p in enumerate(doc):
                    print("{} ========================".format(p))
                    strData = " ".join(p.lines).strip()
                    encodedBytes = base64.b64encode(strData.encode("utf-8"))
                    encodedStr = str(encodedBytes, "utf-8")
                    print(encodedStr)
                    print(strData)
                    if c == 3:
                        # stop after the first few pages while testing
                        exit(1)


if __name__ == "__main__":
    o = PDFObj()
    o.dirSearch()
Test environment
=> Ubuntu 18.04
=> Python 3.6 interpreter
Baekjoon 2108
from operator import itemgetter
from collections import Counter
import sys


class Q_2108():
    def __init__(self):
        self._numCnt = 0
        self._numList = list()
        self._totalSum = 0

    def numSetting(self):
        self._numCnt = int(sys.stdin.readline())
        for _ in range(self._numCnt):
            e = int(sys.stdin.readline())
            self._totalSum += e
            self._numList.append(e)
        # sort the data
        self._numList.sort()

    def arithMean(self):
        avr = round((float(self._totalSum) / len(self._numList)))
        print(avr)

    def middleNumber(self):
        print(self._numList[int(len(self._numList) / 2)])

    def largeCnt(self):
        modeDict = Counter(self._numList)
        modes = modeDict.most_common()
        if len(self._numList) > 1:
            if modes[0][1] == modes[1][1]:
                # several modes: print the second smallest one
                mod = modes[1][0]
            else:
                mod = modes[0][0]
        else:
            mod = modes[0][0]
        print(mod)
        # keyNum = set(self._numList)
        # numCnt = [(k, self._numList.count(k)) for k in keyNum]
        #
        # # sort by count, then by value
        # numCnt.sort(key=lambda e: (e[1], e[0]), reverse=True)
        #
        # # lists preserve order
        # rawV = [i[1] for i in numCnt]
        # largeNum = rawV[0]
        # largeCnt = rawV.count(largeNum)
        # largeArr = [i[0] for i in numCnt[:largeCnt]]
        #
        # if len(largeArr) == 1:
        #     print(largeArr[0])
        # elif len(largeArr) > 1:
        #     if len(largeArr) == 2:
        #         print(largeArr[1])
        #     else:
        #         print(largeArr[len(largeArr) - 2])

    def numRange(self):
        print(self._numList[len(self._numList) - 1] - self._numList[0])


if __name__ == "__main__":
    q = Q_2108()
    q.numSetting()
    q.arithMean()
    q.middleNumber()
    q.largeCnt()
    q.numRange()
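A quick hand check, feeding a small input through sys.stdin (a sketch; it assumes the Q_2108 class above is already defined in the session):

import io
import sys

sys.stdin = io.StringIO("5\n1\n3\n8\n-2\n2\n")
q = Q_2108()
q.numSetting()
q.arithMean()     # 2   (12 / 5 = 2.4, rounded)
q.middleNumber()  # 2   (median of -2, 1, 2, 3, 8)
q.largeCnt()      # 1   (every value ties, so the second smallest is printed)
q.numRange()      # 10  (8 - (-2))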
Elasticsearch + python + pipeline
from ELa import Ela


class Client:
    def __init__(self):
        self._elaClient = Ela.retElanode()
        self._targetDir = ""

    def insrtDoc(self):
        # "aGVsbG8=" is base64 for "hello"; the attachment pipeline decodes it
        d = {"data": "aGVsbG8="}
        self._elaClient.index(index="x_index", pipeline="attachment", body=d)


o = Client()
o.insrtDoc()
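The index() call assumes an ingest pipeline named attachment already exists and that the ingest-attachment plugin is installed. A minimal sketch of how such a pipeline could be created with the elasticsearch-py client (the host is a placeholder):

from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  # placeholder host
es.ingest.put_pipeline(
    id="attachment",
    body={
        "description": "extract text from base64-encoded documents",
        "processors": [
            {"attachment": {"field": "data"}}
        ],
    },
)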
from csv to json convert + logstash
import csv
import json
import os
import shutil
import subprocess
import time


class ConvJson():
    def __init__(self):
        # target directory
        self._targetPath = r""
        # directory the converted json files are moved to
        self._jsonPath = "jsonDir"
        # field names
        self._fieldName = ("a", "b", "c")
        # logstash paths
        self._logstashRun = ""
        self._targetConfDir = ""

    def csvRead(self):
        # change into the target directory
        os.chdir(self._targetPath)
        # file list
        fileList = os.listdir()
        for f in fileList:
            fileAbsPath = os.path.abspath(f)
            fileName, fileExtension = os.path.splitext(fileAbsPath)
            if fileExtension == ".csv":
                try:
                    csvFile = open(fileAbsPath, "r", encoding="utf-8")
                    next(csvFile)  # skip the header line
                except FileNotFoundError as E:
                    print(E)
                    exit(1)
                else:
                    """ convert to an ndjson file
                    """
                    try:
                        jsonFile = open("{}.json".format(fileName), "w", encoding="utf-8")
                    except:
                        print("file error")
                        exit(1)
                    else:
                        reader = csv.DictReader(f=csvFile, fieldnames=self._fieldName, delimiter="|")
                        data = list(reader)
                        for x in range(0, len(data)):
                            # json.dump(data, fp=jsonFile)
                            if x != len(data) - 1:
                                strData = json.dumps(data[x]) + "\n"
                            else:
                                strData = json.dumps(data[x])
                            jsonFile.write(strData)
                        jsonFile.close()
                        csvFile.close()

    def jsonFileMov(self):
        # change into the target directory
        os.chdir(self._targetPath)
        # file list
        fileList = os.listdir()
        for f in fileList:
            fileAbsPath = os.path.abspath(f)
            fileName, fileExtension = os.path.splitext(fileAbsPath)
            if fileExtension == ".json":
                try:
                    shutil.move("{}.json".format(fileName), self._targetPath + "\\" + self._jsonPath)
                except OSError as E:
                    print("file move error")
                    print(E)
                    exit(1)
                else:
                    print("move success")

    def logstashInsert(self):
        # change into the json directory
        os.chdir(self._targetPath + "\\" + self._jsonPath)
        # file list
        fileList = os.listdir()
        for f in fileList:
            fileAbsPath = os.path.abspath(f)
            command = self._logstashRun + " -f " + self._targetConfDir + " < " + fileAbsPath
            p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
            stdout, _ = p.communicate()
            print(stdout)
            print("indexing complete")
            print("=======================================================")
            time.sleep(5)


def main():
    o = ConvJson()
    o.csvRead()
    o.jsonFileMov()


if __name__ == "__main__":
    main()
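To see what the conversion produces, a tiny self-contained sketch with made-up data (one pipe-delimited row becomes one ndjson line):

import csv
import io
import json

sample = io.StringIO("1|hello|2019-11-26\n")
reader = csv.DictReader(f=sample, fieldnames=("a", "b", "c"), delimiter="|")
for row in reader:
    print(json.dumps(row))  # {"a": "1", "b": "hello", "c": "2019-11-26"}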
naver movie info crawling => logstash data pipeline => kibana visualization
input {
  stdin { }
}
filter {
  json {
    source => "message"
  }
  mutate {
    remove_field => ["@version", "host", "message", "path"]
  }
}
output {
  stdout { codec => rubydebug }
  elasticsearch {
    hosts => ["http://192.168.42.136:9200"]
    index => "nvr_movie"
  }
}
The configuration above is the Logstash side; the Python crawler code follows.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import json
from Utils import Utils

#
# author : JunHyeon.Kim
# date   : 20191110
# ver    : 0.1
# naver movie data crawling
#


class MV:
    def __init__(self):
        INFO_ = Utils.getSetting()
        self._cllTime = Utils.getCllTime()
        self._category = Utils.getCategory()
        self._totalData = list()
        self._urlInfo = {"url": INFO_["url"], "path": INFO_["path"], "time": self._cllTime}

    def urlRequests(self):
        for i in self._category["category"]:
            urlArgs_ = urlencode(
                {
                    "sel": "cnt",
                    "date": self._urlInfo["time"],
                    "tg": i
                }
            )
            print(urlArgs_, self._category["category"][i])
            requestUrl_ = self._urlInfo["url"] + self._urlInfo["path"] + "?" + urlArgs_
            try:
                htmlObj = requests.get(requestUrl_)
            except requests.exceptions.ConnectionError as E:
                print(E)
                exit(1)
            else:
                if htmlObj.status_code == 200:
                    bsObj = BeautifulSoup(htmlObj.text, "html.parser")
                    titles = bsObj.select("td.title > div.tit3 > a")
                    with open("./nvr_movie_" + self._cllTime + "_.json", "a", encoding="utf-8") as f:
                        for c, t in enumerate(titles):
                            d = {
                                "title": t.attrs["title"],
                                "rank": c + 1,
                                "clltime": self._cllTime,
                                "genr": self._category["category"][i]
                            }
                            f.write(json.dumps(d, ensure_ascii=False) + "\n")


def main():
    mvObj = MV()
    mvObj.urlRequests()


if __name__ == "__main__":
    main()
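The Utils module imported above is not included in the post. Purely to illustrate the shape the crawler expects, a hypothetical stand-in might look like this (all values are placeholders, not the author's settings):

from datetime import datetime


class Utils:
    """Hypothetical stand-in for the Utils helper the post does not show."""

    @staticmethod
    def getSetting():
        # placeholder base url and path of the ranking page being crawled
        return {"url": "https://example.com", "path": "/ranking"}

    @staticmethod
    def getCllTime():
        # collection time in yyyymmdd form, used as the "date" query argument
        return datetime.now().strftime("%Y%m%d")

    @staticmethod
    def getCategory():
        # genre codes (the "tg" query argument) mapped to genre names (example values)
        return {"category": {"1": "drama", "4": "comedy"}}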
matplotlib => line_chart + excel
import pandas as pd
import matplotlib.pyplot as plt


class LineChart:
    def __init__(self):
        self._xlsx = LineChart.getConfig()
        self._xvalues = list()
        self._yvalues = list()

    def doLine(self):
        for i in self._xlsx.index:
            # "날짜" = date column, "방문자수" = visitor-count column
            rowData = self._xlsx["날짜"][i], self._xlsx["방문자수"][i]
            self._xvalues.append(rowData[0])
            self._yvalues.append(rowData[1])
        df = pd.DataFrame({"xvalues": self._xvalues, "yvalues": self._yvalues})
        # plot
        plt.plot("xvalues", "yvalues", data=df, color="g")
        plt.xlabel("year")
        plt.ylabel("count")
        plt.title("tistory my count")
        plt.show()

    @classmethod
    def getConfig(cls):
        PATH = "/home/kim/Desktop/PY.dir/stu_01.dir/KimJH_uv.xlsx"
        SHEETNAME = "uv"
        xlsxObj = pd.read_excel(PATH, sheet_name=SHEETNAME)
        return xlsxObj


def main():
    obj = LineChart()
    obj.doLine()


if __name__ == "__main__":
    main()
matplotlib => pie_chart + xlsx
import pandas as pd
import matplotlib.pyplot as plt


class PIE:
    def __init__(self):
        self._xlsx = PIE.getConfig()
        # "남자" = male, "여자" = female visitor totals
        self._category = {"남자": 0, "여자": 0}

    def doPie(self):
        for i in self._xlsx.index:
            # "성별" = gender column, "방문자수" = visitor-count column
            rowData = self._xlsx["성별"][i], self._xlsx["방문자수"][i]
            self._category[rowData[0]] += rowData[1]
        category = ["man", "woman"]
        categorySize = [self._category["남자"], self._category["여자"]]
        categoryColors = ["yellow", "green"]
        categoryExplodes = (0.1, 0)
        plt.pie(categorySize,
                explode=categoryExplodes,
                labels=category,
                colors=categoryColors,
                autopct='%1.2f%%',
                shadow=True,
                startangle=90,
                textprops={"fontsize": 14})  # text font size
        plt.axis("equal")
        plt.show()

    @classmethod
    def getConfig(cls):
        PATH = "/home/kim/Desktop/PY.dir/stu_01.dir/KimJH.xlsx"
        SHEETNAME = "demographicsDashboard"
        xlsxObj = pd.read_excel(PATH, sheet_name=SHEETNAME)
        return xlsxObj


def main():
    obj = PIE()
    obj.doPie()


if __name__ == "__main__":
    main()
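To try the plotting logic without the xlsx file, a small test sketch that patches getConfig with an in-memory DataFrame of made-up numbers (since __init__ calls PIE.getConfig() directly, the patch goes on the class):

import pandas as pd

# made-up rows in the same layout: "성별" = gender, "방문자수" = visitor count
sample = pd.DataFrame({"성별": ["남자", "여자", "남자"], "방문자수": [120, 150, 80]})
PIE.getConfig = classmethod(lambda cls: sample)

obj = PIE()
obj.doPie()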