#_____________________________________________
import requests
from bs4 import BeautifulSoup
import pprint as ppr
import re
import pymongo  # imported but not used in this script
from openpyxl import load_workbook
#_____________________________________________
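# Fetch a web page, pull the absolute .jpg URLs out of its <img> tags,
# and write them into a new sheet of an existing Excel workbook.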
class Web_crawling:
    def __init__(self, url):
        self.req_data = requests.get(url)  # fetch the page at the given URL
        self.html_data = self.req_data.text
        self.soup_data = BeautifulSoup(self.html_data, "html.parser")
        self.re_sorting_1 = re.compile(r"https?://")  # start of an absolute URL
        self.re_sorting_2 = re.compile(r"\.jpg")      # ".jpg" extension
        self.Image_url = list()
        self.wb_excel = load_workbook('C:\\Users\\sleep\\Desktop\\stu.xlsx')
    def HTML_INFORMATION(self):
        # print(self.soup_data)  # dump the parsed HTML for debugging
        pass
    def Want_data_sorting(self):
        ws = self.wb_excel.create_sheet(title="Test_url")
        tmp_value = self.soup_data.findAll("img")
        for Link in tmp_value:
            Link = str(Link)
            LeftIndex = self.re_sorting_1.search(Link)
            RightIndex = self.re_sorting_2.search(Link)
            try:
                # slice from the start of the URL scheme to the end of ".jpg"
                indx_L = LeftIndex.span()
                indx_R = RightIndex.span()
                self.Image_url.append(Link[indx_L[0]:indx_R[1]])
            except AttributeError:
                # search() returned None: this <img> tag has no absolute .jpg URL
                pass
        ppr.pprint(self.Image_url)
        # write each extracted URL into columns A and B of the new sheet, one row per URL
        Indx = 1
        for v in self.Image_url:
            cell_a = "A" + str(Indx)
            print(cell_a)
            ws[cell_a] = v
            cell_b = "B" + str(Indx)
            print(cell_b)
            ws[cell_b] = v
            Indx += 1
    def __del__(self):
        # save the workbook when the crawler object is garbage-collected
        self.wb_excel.save('C:\\Users\\sleep\\Desktop\\stu.xlsx')
def main():
    me_craw = Web_crawling("http://www.goodhousekeeping.com/home/decorating-ideas/tips/g3902/interior-paint-colors")
    me_craw.HTML_INFORMATION()
    me_craw.Want_data_sorting()


if __name__ == "__main__":
    main()