app-search
ELK/elasticsearch · 2020. 1. 6. 11:30
from pdflib import Document
import os
import base64
import yaml
import time
from time import strftime
# app-search
from swiftype_app_search import Client
from ela_dir.Ela import Ela
#
# Read PDF files and insert them into App Search
#
class AppSearch():
    """Base object holding an App Search client and the target engine name.

    Both values are built from ``./app_search_info/app_info.yml`` (path is
    relative to the current working directory) when the instance is created.
    """

    def __init__(self):
        ARGS = AppSearch.getAppObj()
        self._appClient = ARGS.get("client")
        self._appEngine = ARGS.get("engine_name")

    @classmethod
    def getAppObj(cls):
        """Load the App Search settings and build the client.

        Reads the keys ``api_key``, ``base_endpoint``, ``use_https`` and
        ``engine_name`` from the YAML config file.

        Returns:
            dict: ``{"client": <Client>, "engine_name": <value from config>}``.

        Raises:
            SystemExit: with exit code 1 (after printing the error) when the
                config file does not exist.
        """
        try:
            f = open("./app_search_info/app_info.yml", "r", encoding="utf-8")
        except FileNotFoundError as E:
            # A missing file raises FileNotFoundError; FileExistsError is only
            # raised by exclusive-create modes and would never match here.
            print(E)
            raise SystemExit(1)

        # Close the handle as soon as the YAML has been parsed.
        with f:
            appArgs = yaml.safe_load(f)

        client = Client(
            api_key=appArgs.get("api_key"),
            base_endpoint=appArgs.get("base_endpoint"),
            use_https=appArgs.get("use_https"),
        )
        return {
            "client": client,
            "engine_name": appArgs.get("engine_name"),
        }
class PDFObj(AppSearch):
    """Indexes the pages of PDF files from a configured directory into App Search.

    The directory comes from the ``target_path`` key of ``./conf/info.yml``.
    """

    def __init__(self):
        AppSearch.__init__(self)
        self._targetPath = PDFObj.getFilePath()
        # Extensions (lower-case, with leading dot) that are processed.
        self._fileTypeList = [".pdf"]
        # Collection date stamp (YYYYMMDD, local time) attached to each document.
        self._timeObj = strftime("%Y%m%d", time.localtime())

    # Walk the files that arrived via FTP
    def dirSearch(self):
        """Index every page of every matching file in the target directory.

        Only the top level of the directory is listed (no recursion). One
        App Search document is sent per page, carrying the file's metadata,
        its base name, the page text, the collection date and the absolute
        file path.
        """
        # Build explicit paths instead of os.chdir() so the process working
        # directory is not changed as a side effect of indexing.
        targetDir = os.path.abspath(self._targetPath)
        for entry in os.listdir(targetDir):
            fname, fext = os.path.splitext(entry)
            # Compare case-insensitively so "REPORT.PDF" is not skipped.
            if fext.lower() not in self._fileTypeList:
                continue

            fullPath = os.path.join(targetDir, entry)
            doc = Document(fullPath)
            for pageNo, content in enumerate(doc, start=1):
                print("{} 처리 중 ...".format(pageNo))
                strData = " ".join(content.lines).strip()
                element = {
                    "metadata": doc.metadata,
                    "fileName": fname,
                    "content": strData,
                    "cllctTime": self._timeObj,
                    "filepath": fullPath,
                }
                self._appClient.index_document(self._appEngine, element)

    @classmethod
    def getFilePath(cls):
        """Return the ``target_path`` value from ``./conf/info.yml``.

        Returns:
            The configured value, or ``None`` when the key is absent.

        Raises:
            SystemExit: with exit code 1 (after printing the error) when the
                config file does not exist.
        """
        try:
            f = open("./conf/info.yml", "r", encoding="utf-8")
        except FileNotFoundError as E:
            print(E)
            raise SystemExit(1)

        # Close the handle as soon as the YAML has been parsed.
        with f:
            filePath = yaml.safe_load(f)
        return filePath.get("target_path")
if __name__ == "__main__":
    # Script entry point: build the indexer from the config files and
    # process the configured directory once.
    PDFObj().dirSearch()
'ELK > elasticsearch' 카테고리의 다른 글
reindex query shellscript array (0) | 2020.02.03 |
---|---|
대상 인덱스의 field 모두를 fielddata true로 변환하는 방법 (0) | 2020.01.17 |
nginx setting (0) | 2019.12.19 |
python-appsearch (0) | 2019.12.19 |
Elasticsearch + python + pipeline (0) | 2019.12.02 |