app-search

ELK/elasticsearch · 2020. 1. 6. 11:30
from pdflib import Document
import os
import base64
import yaml

import time
from time import strftime

# app-search
from swiftype_app_search import Client

from ela_dir.Ela import Ela

#
# Read PDF files and insert them into app-search
#

class AppSearch():
    """Holds an App Search client and engine name read from a YAML config.

    The settings come from ``./app_search_info/app_info.yml``; the path is
    relative, so it is resolved against the current working directory.
    """

    def __init__(self):
        """Load the settings and store the client and the engine name."""
        args = AppSearch.getAppObj()
        self._appClient = args.get("client")
        self._appEngine = args.get("engine_name")

    @classmethod
    def getAppObj(cls):
        """Build the App Search client described by the YAML config file.

        Reads the ``api_key``, ``base_endpoint``, ``use_https`` and
        ``engine_name`` keys from the config.

        Returns:
            dict: ``{"client": <Client instance>, "engine_name": <value>}``.

        Raises:
            SystemExit: with status 1, after printing the error, when the
                config file does not exist.
        """
        try:
            # Opening for reading reports a missing file as FileNotFoundError;
            # FileExistsError is only raised when *creating* a file that
            # already exists, so catching it here never handled anything.
            f = open("./app_search_info/app_info.yml", "r", encoding="utf-8")
        except FileNotFoundError as E:
            print(E)
            raise SystemExit(1)

        # Close the handle as soon as the YAML has been parsed.
        with f:
            appArgs = yaml.safe_load(f)

        client = Client(
            api_key=appArgs.get("api_key"),
            base_endpoint=appArgs.get("base_endpoint"),
            use_https=appArgs.get("use_https"),
        )

        return {
            "client": client,
            "engine_name": appArgs.get("engine_name"),
        }


class PDFObj(AppSearch):
    """Indexes the pages of PDF files from a configured directory.

    The directory comes from the ``target_path`` key of ``./conf/info.yml``;
    each page of each matching file is sent to the App Search engine as its
    own document.
    """

    def __init__(self):
        """Set up the client, the target directory and the collection date."""
        super().__init__()
        self._targetPath = PDFObj.getFilePath()
        # Only files whose extension is listed here are processed
        # (the comparison is case-sensitive).
        self._fileTypeList = [".pdf"]
        # Date stamp (YYYYMMDD, local time) taken once at construction and
        # attached to every indexed page.
        self._timeObj = strftime("%Y%m%d", time.localtime())

    def dirSearch(self):
        """Walk the target directory and index every page of every PDF.

        Only the top level of the directory is listed. For each page a
        document with the keys ``metadata``, ``fileName``, ``content``,
        ``cllctTime`` and ``filepath`` is indexed.

        NOTE(review): this changes the process working directory to the
        target path and does not restore it; the relative file names below
        and ``os.path.abspath`` rely on that.
        """
        # Per the original comment, these are files that arrived via FTP.
        os.chdir(self._targetPath)

        for f in os.listdir():
            fname, fext = os.path.splitext(f)
            if fext not in self._fileTypeList:
                continue

            doc = Document(f)
            for page, content in enumerate(doc, start=1):
                print("{} 처리 중 ...".format(page))

                # Collapse the page's lines into a single text field.
                strData = " ".join(content.lines).strip()

                element = {
                    "metadata": doc.metadata,
                    "fileName": fname,
                    "content": strData,
                    "cllctTime": self._timeObj,
                    "filepath": os.path.abspath(f),
                }
                self._appClient.index_document(self._appEngine, element)

    @classmethod
    def getFilePath(cls):
        """Return the ``target_path`` value from ``./conf/info.yml``.

        Returns:
            The value stored under ``target_path`` (``None`` if the key is
            absent).

        Raises:
            SystemExit: with status 1, after printing the error, when the
                config file does not exist.
        """
        try:
            f = open("./conf/info.yml", "r", encoding="utf-8")
        except FileNotFoundError as E:
            print(E)
            raise SystemExit(1)

        # Close the handle once parsed; the original left it open.
        with f:
            filePath = yaml.safe_load(f)
        return filePath.get("target_path")

if __name__ == "__main__":
    # Script entry point: build the indexer (which loads both YAML configs)
    # and push every PDF page found in the target directory.
    indexer = PDFObj()
    indexer.dirSearch()


'ELK > elasticsearch' 카테고리의 다른 글

reindex query shellscript array  (0) 2020.02.03
대상 인덱스의 field 모두를 fielddata true로 변환하는 방법  (0) 2020.01.17
nginx setting  (0) 2019.12.19
python-appsearch  (0) 2019.12.19
Elasticsearch + python + pipeline  (0) 2019.12.02