인구 크롤링

언어/python · 2018. 9. 26. 16:21

target site : https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%EC%9D%98_%EC%9D%B8%EA%B5%AC%EC%88%9C_%EB%8F%84%EC%8B%9C_%EB%AA%A9%EB%A1%9D


# __________________________________

import requests as req

from bs4 import BeautifulSoup

import pandas as pd

import re

# __________________________________

target_url = "https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%EC%9D%98_%EC%9D%B8%EA%B5%AC%EC%88%9C_%EB%8F%84%EC%8B%9C_%EB%AA%A9%EB%A1%9D"

# Download the target page. The timeout keeps the script from blocking
# indefinitely when the server does not answer.
html = req.get(url=target_url, timeout=10)

# Maps each scraped name to its population count (int).
adDict = {}

# A 200 status already implies `html.ok`, so one check is enough.
if html.status_code == 200:
    bs_obj = BeautifulSoup(html.text, "html.parser")

    # CSS paths noted while inspecting the page:
    #   name:
    #     #mw-content-text > div > table > tbody > tr:nth-child(1) > td:nth-child(2) > a
    #     #mw-content-text > div > table > tbody > tr:nth-child(2) > td:nth-child(2) > a
    #     ...
    #     #mw-content-text > div > table > tbody > tr:nth-child(85) > td:nth-child(2) > a:nth-child(2)
    #     #mw-content-text > div > table > tbody > tr:nth-child(80) > td:nth-child(2) > a:nth-child(2)
    #   population:
    #     #mw-content-text > div > table > tbody > tr:nth-child(1) > td:nth-child(4)

    # Select every row once instead of issuing one whole-document query per
    # row index, and handle each row on its own so rows from different
    # tables are never merged into a single entry.
    for row in bs_obj.select("#mw-content-text > div > table > tbody > tr"):
        # Skip the first <tr> of each table body; this mirrors the original
        # scan, which started at tr:nth-of-type(2).
        if row.find_previous_sibling("tr") is None:
            continue

        name_links = row.select("td:nth-of-type(2) > a")    # name link(s)
        count_cells = row.select("td:nth-of-type(4)")       # population cell

        # Rows lacking either piece cannot produce an entry; skip them
        # rather than crash on an empty selection.
        if not name_links or not count_cells:
            continue

        # get_text() is used instead of .string because .string is None
        # whenever the element contains nested markup.
        name = " ".join(link.get_text() for link in name_links).rstrip(" ")

        # Take the first run of digits/commas so trailing markup text
        # (e.g. a footnote marker) does not break the integer conversion.
        digits = re.search(r"\d[\d,]*", count_cells[0].get_text())
        if digits is None:
            continue

        adDict[name] = int(digits.group().replace(",", ""), 10)

    Lst = list(adDict.items())

    df = pd.DataFrame(data=Lst, columns=['Name', 'Cnt'])

    print(df)

'언어 > python' 카테고리의 다른 글

로컬 피시 ip 확인 - 파이썬  (0) 2018.10.06
파이썬 에러 목록  (0) 2018.09.30
Backdoor  (0) 2018.09.19
windll.kernel32.lstrcmpW  (0) 2018.09.11
from_python_to_c_02  (0) 2018.09.09