pdf 변환

언어/python · 2019. 12. 18. 21:14
from pdflib import Document
import os
import base64

from ela_dir.Ela import Ela

class PDFObj():
    """Print the metadata and the per-page text of every PDF in a directory."""

    def __init__(self):
        # Directory that dirSearch() switches into and scans.
        self._targetPath = "./pdf_dir"

    def dirSearch(self):
        """Change into the target directory and dump each ``.pdf`` file found.

        For every matching file the document metadata is printed, followed by
        one dict per page holding the 1-based page number ("page_") and the
        page's lines joined with single spaces and stripped ("data_").
        The process working directory stays changed afterwards.
        """
        os.chdir(self._targetPath)

        for entry in os.listdir():
            # Only files whose extension is exactly ".pdf" are processed.
            if os.path.splitext(entry)[1] != ".pdf":
                continue

            pdf = Document(entry)
            print(pdf.metadata)

            for page_no, page in enumerate(pdf, start=1):
                text = " ".join(page.lines).strip()
                print({"page_": page_no, "data_": text})



if __name__ == "__main__":
    # Run the directory scan when executed as a script.
    PDFObj().dirSearch()

'언어 > python' 카테고리의 다른 글

네이버 python 지식인 답변  (0) 2020.06.06
21대 국회의원 선거 크롤링  (0) 2020.04.15
python으로 pdf 파일 read  (0) 2019.12.08
백준 2108  (0) 2019.12.08
from csv to json convert + logstash  (0) 2019.11.26

	/**
	 * Deletes the given index through the supplied high-level REST client.
	 *
	 * <p>An {@link IOException} raised by the request is reported (stack trace
	 * plus a failure line on stderr) and not rethrown. The success message is
	 * printed only when the delete call returned without throwing.
	 *
	 * @param param       client used to issue the delete-index request
	 * @param targetIndex name of the index to remove
	 */
	static public void indexRemoveQuery(RestHighLevelClient param, String targetIndex) {

		DeleteIndexRequest deleteIndexRequest = new DeleteIndexRequest(targetIndex);
		try {
			param.indices().delete(deleteIndexRequest, RequestOptions.DEFAULT);
			// Reached only when the request completed without an exception.
			System.out.println(targetIndex +" : index remove success");
		} catch (IOException e) {
			e.printStackTrace();
			System.err.println(targetIndex + " : index remove failed");
		}
	}

'언어 > java' 카테고리의 다른 글

정올 587번 문제  (0) 2020.04.06
변수 [ Java의 정석 ]  (0) 2020.04.06
elasticsearch java api : total index search  (0) 2019.04.22
elasticsearch java api search  (0) 2019.03.13
크롤링 => json 파일로 적재  (0) 2019.03.03

logstash + app-search

ELK/logstash · 2019. 12. 10. 14:55

# Input: read JSON-encoded events from a local file, starting at the beginning of the file.
input {
    file {
        start_position => "beginning"
        path           => ["/opt/logstash-7.2.0/conf/test.json"]
        # NOTE(review): presumably /dev/null is used so the read offset is not persisted between runs - confirm.
        sincedb_path   => "/dev/null"
        codec          => "json"
    }
}

# Filter: drop the listed fields from every event before it is sent.
filter {
    mutate {
        remove_field => ["path", "@version", "@timestamp", "host"]
    }
}

# Output: echo each event to stdout and send it to the elastic_app_search output.
output {
    stdout {}
    elastic_app_search {
        # NOTE(review): hard-coded private API key in a published config - consider rotating it
        # and supplying it from outside the file.
        api_key => "private-fobwwhsirra3cgner79khnhi"
        engine  => "kimjunhyeon"
        url     => "http://172.16.40.139:3002"
        path    => "/api/as/v1/"
        id      => "test_01"
    }
}

'ELK > logstash' 카테고리의 다른 글

logstash file json absolute path  (0) 2020.06.17
logstash input-plugin (elasticsearch)  (0) 2020.05.12
logstash plugin install  (0) 2020.04.15
logstash + python  (0) 2019.01.12
logstash - mysql - elasticsearch 연동  (0) 2019.01.06

from pdflib import Document
import os
import base64

class PDFObj():
    """Print each PDF page of a directory as plain text and as base64."""

    def __init__(self):
        # Directory that dirSearch() switches into and scans.
        self._targetPath = "./pdf_dir"

    def dirSearch(self):
        """Change into the target directory and dump each ``.pdf`` file found.

        For every page the page object, the base64 form of its UTF-8 encoded
        text and the text itself are printed.  The process exits with status 1
        right after the page with index 3 (the fourth page) has been printed.
        """
        os.chdir(self._targetPath)

        for entry in os.listdir():
            # Only files whose extension is exactly ".pdf" are processed.
            if os.path.splitext(entry)[1] != ".pdf":
                continue

            pdf = Document(entry)
            print(pdf.metadata)

            for idx, page in enumerate(pdf):
                print("{} ========================".format(page))

                text = " ".join(page.lines).strip()
                encoded = base64.b64encode(text.encode("utf-8")).decode("utf-8")
                print(encoded)
                print(text)

                # Stop the whole run after the fourth page.
                if idx == 3:
                    exit(1)


if __name__ == "__main__":
    # Run the directory scan when executed as a script.
    PDFObj().dirSearch()

 

 

테스트 환경

 => ubuntu 18.04

 => python interpreter 3.6

'언어 > python' 카테고리의 다른 글

21대 국회의원 선거 크롤링  (0) 2020.04.15
pdf 변환  (0) 2019.12.18
백준 2108  (0) 2019.12.08
from csv to json convert + logstash  (0) 2019.11.26
네이버 기사 크롤링 => elasticsearch 적재  (0) 2019.07.12

백준 2108

언어/python · 2019. 12. 8. 18:23
from operator import itemgetter
from collections import Counter
import sys

class Q_2108():
    """Solver for Baekjoon problem 2108: basic statistics over N integers.

    numSetting() reads the input from stdin; the remaining methods each print
    one statistic (rounded mean, middle element, mode, range).
    """

    def __init__(self):

        self._numCnt = 0          # count announced on the first input line
        self._numList = list()    # the numbers; sorted ascending by numSetting()
        self._totalSum = 0        # running sum of the numbers read

    def numSetting(self):
        """Read the count, then that many integers (one per line) from stdin."""

        self._numCnt = int(sys.stdin.readline())

        for _ in range(self._numCnt):
            e = int(sys.stdin.readline())
            self._totalSum += e
            self._numList.append(e)

        # Sort once; middleNumber() and numRange() rely on ascending order.
        self._numList.sort()

    def arithMean(self):
        """Print the arithmetic mean rounded to an integer with round()."""

        avr = round((float(self._totalSum)/len(self._numList)))
        print(avr)

    def middleNumber(self):
        """Print the element at index ``len // 2`` of the sorted list."""

        print(self._numList[len(self._numList) // 2])

    def largeCnt(self):
        """Print the mode; with several modes, print the second smallest one.

        The candidates are collected and sorted explicitly, so the result does
        not depend on Counter ordering, and a list that holds a single distinct
        value (e.g. ``[1, 1]``) no longer raises IndexError.
        """

        counts = Counter(self._numList)
        topCount = max(counts.values())
        candidates = sorted(v for v, c in counts.items() if c == topCount)

        mod = candidates[1] if len(candidates) > 1 else candidates[0]
        print(mod)

    def numRange(self):
        """Print the difference between the last and first sorted elements."""

        print(self._numList[-1] - self._numList[0])


if __name__ == "__main__":
    # Read the input once, then print the four statistics in the required order.
    solver = Q_2108()
    solver.numSetting()
    for report in (solver.arithMean, solver.middleNumber, solver.largeCnt, solver.numRange):
        report()

'언어 > python' 카테고리의 다른 글

pdf 변환  (0) 2019.12.18
python으로 pdf 파일 read  (0) 2019.12.08
from csv to json convert + logstash  (0) 2019.11.26
네이버 기사 크롤링 => elasticsearch 적재  (0) 2019.07.12
naver music 크롤링 + elasticsearch  (0) 2019.05.22

from ELa import Ela

class Client:
    """Index one hard-coded sample document through an ingest pipeline."""

    def __init__(self):
        # Client handle returned by the project's Ela helper.
        self._elaClinet = Ela.retElanode()
        self._targetDir = ""

    def insrtDoc(self):
        """Send the sample document to ``x_index`` via the ``attachment`` pipeline."""
        payload = {"data": "aGVsbG8="}
        self._elaClinet.index(
            index="x_index",
            pipeline="attachment",
            body=payload,
        )

# Executed at module level: build the client and index the sample document.
Client().insrtDoc()

 

import csv
import json
import os
import shutil
import subprocess
import time

#

class ConvJson():
    """Convert pipe-delimited CSV files to NDJSON and feed them to logstash."""

    def __init__(self):

        # Directory that holds the source CSV files.
        self._targetPath = r""

        # Sub-directory of the target directory that receives the JSON files.
        self._jsonPath = "jsonDir"

        # Field names applied to every CSV row.
        self._fieldName = ("a", "b", "c")

        # Logstash command and the config passed to it with -f.
        self._logstashRun   = ""
        self._targetConfDir = ""

    def csvRead(self):
        """Write an NDJSON file next to every ``.csv`` file in the target directory.

        The first line of each CSV is skipped, the remaining rows are parsed
        with ``|`` as delimiter and the configured field names, and each row is
        written as one JSON object per line (no trailing newline).  Changes the
        working directory to the target directory.

        Raises:
            SystemExit: with status 1 when a CSV or JSON file cannot be opened.
        """

        # Move into the target directory.
        os.chdir(self._targetPath)

        for f in os.listdir():
            fileAbsPath = os.path.abspath(f)
            fileName, fileExtension = os.path.splitext(fileAbsPath)

            if fileExtension != ".csv":
                continue

            try:
                with open(fileAbsPath, "r", encoding="utf-8") as csvFile:
                    # Skip the first line; the default keeps an empty file
                    # from raising StopIteration.
                    next(csvFile, None)
                    reader = csv.DictReader(f=csvFile, fieldnames=self._fieldName, delimiter="|")
                    lines = [json.dumps(row) for row in reader]
            except OSError as E:
                print(E)
                raise SystemExit(1)

            # Convert to an ndjson file.
            try:
                with open("{}.json".format(fileName), "w", encoding="utf-8") as jsonFile:
                    jsonFile.write("\n".join(lines))
            except OSError:
                print("file error")
                raise SystemExit(1)

    def jsonFileMov(self):
        """Move every ``.json`` file of the target directory into the JSON sub-directory.

        The destination is created when missing and is built with the
        platform's path separator.  Changes the working directory to the
        target directory.

        Raises:
            SystemExit: with status 1 when a file cannot be moved.
        """

        # Move into the target directory.
        os.chdir(self._targetPath)

        # The working directory is now the target directory, so this resolves
        # to <target>/<jsonPath> regardless of the path separator in use.
        destDir = os.path.abspath(self._jsonPath)

        for f in os.listdir():
            fileAbsPath = os.path.abspath(f)

            if os.path.splitext(fileAbsPath)[1] != ".json":
                continue

            try:
                os.makedirs(destDir, exist_ok=True)
                shutil.move(fileAbsPath, destDir)
            except OSError as E:
                print("파일 이동 에러")
                print(E)
                raise SystemExit(1)
            else:
                print ("이동 성공")

    def logstashInsert(self):
        """Run logstash once per file of the JSON directory, feeding the file on stdin.

        The captured stdout of each run is printed, followed by a 5 second
        pause.  Changes the working directory to the JSON directory.
        """

        # Move into the JSON directory.
        os.chdir(os.path.join(self._targetPath, self._jsonPath))

        # Only configured values go into the shell command; the data file is
        # attached as stdin directly, so file names containing spaces or shell
        # metacharacters cannot break or alter the command.
        command = self._logstashRun + " -f " + self._targetConfDir

        for f in os.listdir():
            fileAbsPath = os.path.abspath(f)

            with open(fileAbsPath, "rb") as jsonInput:
                p = subprocess.Popen(command, stdin=jsonInput, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
                stdout, _ = p.communicate()
            print(stdout)

            print("indexing complate")
            print("=======================================================")
            time.sleep(5)

def main():
    """Convert the CSV files to JSON, then move the JSON files."""
    converter = ConvJson()
    converter.csvRead()
    converter.jsonFileMov()


if __name__ == "__main__":
    main()

'언어 > python' 카테고리의 다른 글

python으로 pdf 파일 read  (0) 2019.12.08
백준 2108  (0) 2019.12.08
네이버 기사 크롤링 => elasticsearch 적재  (0) 2019.07.12
naver music 크롤링 + elasticsearch  (0) 2019.05.22
네이버 뉴스 크롤링 + 형태소  (0) 2019.05.01

# Input: events arrive on standard input.
input {
    stdin { }
}

# Filter: parse the JSON text held in the "message" field, then drop the listed fields.
filter {
    json {
        source => "message"
    }
    mutate {
        remove_field => ["@version", "host", "message", "path"]
    }
}

# Output: echo each event to stdout (rubydebug codec) and send it to the
# elasticsearch output using the index name nvr_movie.
output {
    stdout { codec => rubydebug }
    elasticsearch {
        hosts => ["http://192.168.42.136:9200"]
        index => nvr_movie
    }
}

logstash 코드 

 

 

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import json 

from Utils import Utils

#
# author : JunHyeon.Kim
# date   : 20191110
# ver    : 0.1
# naver movie data crawling
#
#

class MV:
    """Fetch one listing page per category and append its titles to an NDJSON file."""

    # Seconds to wait for the server before a request is abandoned.
    _REQUEST_TIMEOUT = 10

    def __init__(self):

        INFO_           = Utils.getSetting()
        self._cllTime   = Utils.getCllTime()
        self._category  = Utils.getCategory()
        self._totalData = list()
        self._urlInfo   = {"url" : INFO_["url"], "path": INFO_["path"], "time": self._cllTime}

    def urlRequests(self):
        """Request every configured category and write one JSON line per title.

        Each line holds the link's ``title`` attribute, its 1-based position on
        the page ("rank"), the crawl time and the category label ("genr").
        Lines are appended to ``./nvr_movie_<crawl time>_.json``.  A response
        whose status is not 200 is reported and skipped.

        Raises:
            SystemExit: with status 1 when the HTTP request itself fails.
        """

        outPath = "./nvr_movie_"+ self._cllTime +"_.json"

        for i in self._category["category"]:

            genre = self._category["category"][i]

            urlArgs_ = urlencode(
                {
                    "sel" : "cnt",
                    "date": self._urlInfo["time"],
                    "tg"  : i
                }
            )

            print (urlArgs_,  genre)

            requestUrl_ = self._urlInfo["url"] + self._urlInfo["path"] +"?"+ urlArgs_

            try:
                # A timeout keeps a stalled server from hanging the crawl forever.
                htmlObj = requests.get(requestUrl_, timeout=self._REQUEST_TIMEOUT)
            except requests.exceptions.RequestException as E:
                # Covers connection errors as well as timeouts.
                print (E)
                raise SystemExit(1)

            if htmlObj.status_code != 200:
                # Report the skipped category instead of dropping it silently.
                print ("skip {} : HTTP {}".format(requestUrl_, htmlObj.status_code))
                continue

            bsObj  = BeautifulSoup(htmlObj.text, "html.parser")
            titles = bsObj.select("td.title > div.tit3 > a")

            # The with-block closes the file; no explicit close() is needed.
            with open(outPath, "a", encoding="utf-8") as f:

                for c, t in enumerate(titles):

                    d = {
                        "title"  : t.attrs["title"],
                        "rank"   : c+1,
                        "clltime": self._cllTime,
                        "genr"   : genre }

                    f.write(json.dumps(d ,ensure_ascii=False) + "\n")
                        
def main():
    """Create the crawler and run the category requests."""
    MV().urlRequests()


if __name__ == "__main__":
    main()
        

'ELK > elasticsearch' 카테고리의 다른 글

python-appsearch  (0) 2019.12.19
Elasticsearch + python + pipeline  (0) 2019.12.02
python + elasticsearch : csv => bulk json 변환  (0) 2019.10.23
elasticsearch SSL 적용 connect code + python  (0) 2019.10.22
logstash_01 / json  (0) 2019.10.19

import pandas as pd
import matplotlib.pyplot as plt

class LineChart:
    """Draw the "방문자수" column against the "날짜" column of an Excel sheet."""

    def __init__(self):
        # Sheet contents loaded by getConfig().
        self._xlsx    = LineChart.getConfig()
        self._xvalues = list()
        self._yvalues = list()

    def doLine(self):
        """Collect both columns row by row and show them as a green line plot."""
        sheet = self._xlsx

        for row in sheet.index:
            date_value = sheet["날짜"][row]
            visit_value = sheet["방문자수"][row]
            self._xvalues.append(date_value)
            self._yvalues.append(visit_value)

        points = pd.DataFrame({"xvalues" : self._xvalues, "yvalues" : self._yvalues})

        # Plot by column name, then label and display the figure.
        plt.plot("xvalues", "yvalues", data=points, color="g")
        plt.xlabel("year")
        plt.ylabel("count")
        plt.title("tistory my count")
        plt.show()

    @classmethod
    def getConfig(cls):
        """Read the "uv" sheet of the hard-coded workbook and return it."""
        return pd.read_excel(
            "/home/kim/Desktop/PY.dir/stu_01.dir/KimJH_uv.xlsx",
            sheet_name="uv",
        )

def main():
    """Load the sheet and draw the line chart."""
    LineChart().doLine()


if __name__ == "__main__":
    main()

'python > matplolib' 카테고리의 다른 글

matplotlib => pie_chart + xlsx  (0) 2019.11.02

import pandas as pd
import matplotlib.pyplot as plt

class PIE:
    """Draw a pie chart of the "방문자수" totals per "성별" value of an Excel sheet."""

    def __init__(self):
        # Sheet contents loaded by getConfig().
        self._xlsx = PIE.getConfig()
        # Running visitor totals keyed by the values of the "성별" column.
        self._category = {"남자" : 0, "여자" : 0}

    def doPie(self):
        """Sum the visitor counts per category and show them as a pie chart."""
        sheet = self._xlsx

        for row in sheet.index:
            gender = sheet["성별"][row]
            visitors = sheet["방문자수"][row]
            self._category[gender] += visitors

        sizes = [self._category["남자"], self._category["여자"]]

        plt.pie(
            sizes,
            explode=(0.1, 0),
            labels=["man", "woman"],
            colors=["yellow", "green"],
            autopct='%1.2f%%',
            shadow=True,
            startangle=90,
            textprops={"fontsize" : 14},  # label font size
        )
        plt.axis("equal")
        plt.show()

    @classmethod
    def getConfig(cls):
        """Read the "demographicsDashboard" sheet of the hard-coded workbook and return it."""
        return pd.read_excel(
            "/home/kim/Desktop/PY.dir/stu_01.dir/KimJH.xlsx",
            sheet_name="demographicsDashboard",
        )

def main():
    """Load the sheet and draw the pie chart."""
    PIE().doPie()


if __name__ == "__main__":
    main()

 

 

 

'python > matplolib' 카테고리의 다른 글

matplotlib => line_chart + excel  (0) 2019.11.02