# Data crawling
from bs4 import BeautifulSoup
from openpyxl import Workbook
from json import load
import requests
import time
from facebook.TargetString import TargetStr
class Facebook:
    """Crawl the Python codecs documentation page and print a target
    string decoded with every standard encoding listed there.

    The request URL is assembled from ./info.json; results were meant to
    be written to an Excel workbook (cell writes were disabled in the
    original, but the workbook is still saved on object destruction).
    """

    def __init__(self):
        # Build the request URL from info.json, fetch + parse the page,
        # and prepare an in-memory workbook for the results.
        self.requestUrl = Facebook.jsnoFileRead()
        self.bsObject = self.urlRequests()
        self.targetString = TargetStr.target_string
        self.wrkBook = Workbook()  # create the Excel workbook

    @classmethod
    def jsnoFileRead(cls):
        """Read ./info.json and return "{url}/{path}?{param}".

        Exits the process with status 1 when the file is missing.
        (Method name kept as-is — "jsno" is a typo for "json" — so
        existing callers keep working.)
        """
        try:
            # "with" guarantees the file is closed; the original code
            # raised NameError on f.close() when open() itself failed,
            # because f was never bound in that case.
            with open("./info.json", "r") as f:
                json_doc = load(f)  # json.load already returns a dict
        except FileNotFoundError as e:
            print(e)
            exit(1)
        return "{url}/{path}?{param}".format(
            url=json_doc.get("url"),
            path=json_doc.get("path"),
            param=json_doc.get("param"))

    def urlRequests(self):
        """GET self.requestUrl and return a parsed BeautifulSoup tree.

        Exits the process with status 1 on any non-200 response.
        """
        html = requests.get(self.requestUrl)
        if html.status_code == 200:
            return BeautifulSoup(html.text, "html.parser")
        exit(1)

    def urlParcing(self):
        """Scrape the standard-encodings table and print the target
        string decoded with each listed codec (decode errors ignored)."""
        # The sheet is currently unused (cell writes were commented out
        # in the original); it is still created so the saved workbook
        # contains the "decoding_list" sheet as before.
        self.wrkBook.create_sheet("decoding_list")
        table = self.bsObject.select("div#module-codecs > "
                                     "div#standard-encodings > "
                                     "table.docutils > "
                                     "tbody > "
                                     "tr")
        encoding_list = [row.select_one("td").string for row in table]
        for encoding in encoding_list:
            # assumes targetString is a bytes object — TODO confirm
            # against facebook.TargetString.
            result_text = self.targetString.decode(encoding, "ignore")
            time.sleep(1)  # original pacing between iterations, kept
            print(encoding, result_text)

    def __del__(self):
        # NOTE(review): saving inside __del__ is fragile (it may run
        # during interpreter shutdown, or not at all); kept to preserve
        # the original behavior.
        self.wrkBook.save("facebook.xlsx")
def main():
    """Entry point: build the crawler and run the scrape."""
    fnode = Facebook()
    # The redundant fnode.jsnoFileRead() call was removed: __init__
    # already reads info.json, and the return value was discarded here.
    fnode.urlParcing()


if __name__ == "__main__":
    main()
'언어 > python' 카테고리의 다른 글
selenium_ (0) | 2019.03.11 |
---|---|
2019년 3월 9일 ( 주말 프로젝트 ) (0) | 2019.03.09 |
python + 지하철 + 이미지 (0) | 2019.02.24 |
pysimplegui (0) | 2019.02.10 |
python + crawling + elasticsearch (0) | 2019.02.04 |