# Target site: https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%EC%9D%98_%EC%9D%B8%EA%B5%AC%EC%88%9C_%EB%8F%84%EC%8B%9C_%EB%AA%A9%EB%A1%9D
# __________________________________
import requests as req
from bs4 import BeautifulSoup
import pandas as pd
import re
# __________________________________
target_url = "https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%EC%9D%98_%EC%9D%B8%EA%B5%AC%EC%88%9C_%EB%8F%84%EC%8B%9C_%EB%AA%A9%EB%A1%9D"

# Selector for every row of the tables in the article body.  Within a data
# row the script reads:
#   td:nth-of-type(2) > a   -> city name (a cell may hold several anchors,
#                              e.g. "... > td:nth-child(2) > a:nth-child(2)")
#   td:nth-of-type(4)       -> population
_ROW_SELECTOR = "#mw-content-text > div > table > tbody > tr"

# Seconds to wait for the server before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 10


def _parse_population(text):
    """Return the first comma-grouped integer in *text*, or None.

    "1,234,567" becomes 1234567.  Anything following the first number
    (whitespace, footnote markers) is ignored; text that contains no digit
    yields None.
    """
    match = re.search(r"\d[\d,]*", text)
    if match is None:
        return None
    return int(match.group().replace(",", ""), 10)


def _extract_city_populations(soup):
    """Build a ``{city name: population}`` dict from the parsed page *soup*.

    Rows are visited in document order.  A row is skipped when it is the
    first row of its table, when it has no fourth ``td`` cell, when no
    non-empty anchor text is found in the second ``td`` cell, or when the
    population cell holds no number.  A name that occurs more than once keeps
    the value of its last occurrence.
    """
    populations = {}
    for row in soup.select(_ROW_SELECTOR):
        # The scan has always started at tr:nth-of-type(2); the first row of
        # each table is presumably the header row.
        if row.find_previous_sibling("tr") is None:
            continue

        name_links = row.select("td:nth-of-type(2) > a")
        count_cells = row.select("td:nth-of-type(4)")
        if not count_cells:
            continue

        # get_text() never returns None, unlike .string, which is None for
        # anchors that do not wrap exactly one text node (e.g. image links).
        name_parts = [link.get_text().strip() for link in name_links]
        name = " ".join(part for part in name_parts if part)
        count = _parse_population(count_cells[0].get_text())
        if not name or count is None:
            continue

        populations[name] = count
    return populations


html = req.get(url=target_url, timeout=_REQUEST_TIMEOUT)
adDict = {}  # city name -> population
# Nothing is parsed or printed unless the page was fetched successfully.
if html.status_code == 200:
    bs_obj = BeautifulSoup(html.text, "html.parser")
    adDict = _extract_city_populations(bs_obj)
    Lst = list(adDict.items())
    df = pd.DataFrame(data=Lst, columns=['Name', 'Cnt'])
    print(df)
'언어 > python' 카테고리의 다른 글
로컬 피시 ip 확인 - 파이썬 (0) | 2018.10.06 |
---|---|
파이썬 에러 목록 (0) | 2018.09.30 |
Backdoor (0) | 2018.09.19 |
windll.kernel32.lstrcmpW (0) | 2018.09.11 |
from_python_to_c_02 (0) | 2018.09.09 |