2023.06.22

카테고리 없음

2023.06.22

이경찬 :) 2023. 6. 22. 17:06

누락된 정보 채우기

# Flag every row that lacks a title, author, or publisher, or whose
# publication year holds the -1 placeholder.
text_missing = ns_book5[['도서명', '저자', '출판사']].isna().any(axis=1)
year_missing = ns_book5['발행년도'].eq(-1)
na_rows = text_missing | year_missing
# Number of flagged rows.
print(na_rows.sum())
# Preview the first two flagged rows.
ns_book5[na_rows].head(2)

import requests
from bs4 import BeautifulSoup

def get_book_title(isbn):
    """Look up a book title on the YES24 search page for the given ISBN.

    The ISBN is substituted into the search URL, the response is parsed with
    BeautifulSoup, and the text of the first ``<a class="gd_name">`` element
    is returned.

    :param isbn: ISBN to search for (formatted into the query string)
    :return: text of the matched anchor element
    :raises AttributeError: when the page has no matching anchor, because
        ``find`` then yields ``None``
    """
    search_url = 'http://www.yes24.com/Product/Search?domain=BOOK&query={}'.format(isbn)
    response = requests.get(search_url)
    page = BeautifulSoup(response.text, 'html.parser')
    title_link = page.find('a', attrs={'class': 'gd_name'})
    return title_link.get_text()
    
# Example lookup: fetch the title for a single ISBN.
get_book_title(9791191266054)

import re

def get_book_info(row):
    """Fill in missing book fields for one row from a YES24 search page.

    The search is keyed on ``row['ISBN']``. Each field is scraped only when it
    is missing in ``row`` (NaN for title/author/publisher, -1 for the year).
    When the page lacks the expected element, the value already in ``row`` is
    returned unchanged.

    :param row: row (e.g. a pandas Series) providing the keys
        '도서명', '저자', '출판사', '발행년도' and 'ISBN'
    :return: tuple ``(title, author, pub, year)``; a scraped year is the
        four-digit string found in the page, not an int
    """
    title = row['도서명']
    author = row['저자']
    pub = row['출판사']
    year = row['발행년도']

    url = 'http://www.yes24.com/Product/Search?domain=BOOK&query={}'
    r = requests.get(url.format(row['ISBN']))
    soup = BeautifulSoup(r.text, 'html.parser')

    # In every lookup below, soup.find() returns None when the element is
    # absent, so the chained call raises AttributeError; that is treated as
    # "not found" and the original value is kept.
    try:
        if pd.isna(title):
            title = soup.find('a', attrs={'class': 'gd_name'}) \
                    .get_text()
    except AttributeError:
        pass

    try:
        if pd.isna(author):
            authors = soup.find('span', attrs={'class': 'info_auth'}) \
                        .find_all('a')
            # Several authors are joined into one comma-separated string.
            author = ','.join(auth.get_text() for auth in authors)
    except AttributeError:
        pass

    try:
        if pd.isna(pub):
            pub = soup.find('span', attrs={'class': 'info_pub'}) \
                    .find('a') \
                    .get_text()
    except AttributeError:
        pass

    try:
        if year == -1:
            # Keep the raw text in its own variable so `year` is only
            # overwritten once a four-digit match has actually been found.
            year_str = soup.find('span', attrs={'class': 'info_date'}) \
                        .get_text()
            year = re.findall(r'\d{4}', year_str)[0]
    except (AttributeError, IndexError):
        # IndexError: the date text contained no four-digit run.
        pass

    return title, author, pub, year

# Try get_book_info on the first two rows flagged by na_rows;
# result_type='expand' spreads each returned tuple across columns.
updated_sample = ns_book5[na_rows].head(2).apply(get_book_info, axis = 1, result_type = 'expand')
updated_sample

# Show the stored row labelled 78 for comparison
# (presumably one of the sampled rows -- confirm against the output above).
ns_book5.loc[78,:]

# Same call as above, executed a second time.
updated_sample = ns_book5[na_rows].head(2).apply(get_book_info, axis = 1, result_type = 'expand')
updated_sample

data_fixing

def data_fixing(ns_book4):
    """
    Fix invalid values and fill in NaN values.

    Steps: cast the count columns to int32, blank out missing set ISBNs,
    normalise the publication year to an int32 (-1 when unknown), try to fill
    incomplete rows through get_book_info, and finally drop rows that are
    still incomplete.

    :param ns_book4: DataFrame preprocessed by the data_cleaning() function
    :return: new DataFrame without rows whose title, author or publisher is
        NaN or whose publication year is -1
    """
    # Convert the volume count and loan count columns to int32.
    ns_book4 = ns_book4.astype({'도서권수': 'int32', '대출건수': 'int32'})
    # Replace NaN set ISBNs with an empty string.
    set_isbn_na_rows = ns_book4['세트 ISBN'].isna()
    ns_book4.loc[set_isbn_na_rows, '세트 ISBN'] = ''

    # Extract a four-digit year from the publication-year column. Anything
    # that still contains a non-digit (or is missing) becomes '-1'.
    # Raw strings keep the regex escapes from being read as string escapes.
    ns_book5 = ns_book4.replace({'발행년도': r'.*(\d{4}).*'}, r'\1', regex=True)
    unknown_year = ns_book5['발행년도'].str.contains(r'\D', na=True)
    ns_book5.loc[unknown_year, '발행년도'] = '-1'

    # Convert the publication year to int32.
    ns_book5 = ns_book5.astype({'발행년도': 'int32'})
    # Years above 4000 are reduced by 2333.
    dangun_yy_rows = ns_book5['발행년도'].gt(4000)
    ns_book5.loc[dangun_yy_rows, '발행년도'] = ns_book5.loc[dangun_yy_rows, '발행년도'] - 2333
    # Years that are still above 4000 are marked unknown (-1).
    dangun_year = ns_book5['발행년도'].gt(4000)
    ns_book5.loc[dangun_year, '발행년도'] = -1
    # Years strictly between 0 and 1900 are marked unknown (-1).
    old_books = ns_book5['발행년도'].gt(0) & ns_book5['발행년도'].lt(1900)
    ns_book5.loc[old_books, '발행년도'] = -1

    # Find rows whose title, author or publisher is NaN or whose year is -1.
    na_rows = ns_book5['도서명'].isna() | ns_book5['저자'].isna() \
              | ns_book5['출판사'].isna() | ns_book5['발행년도'].eq(-1)
    # Fill in the missing fields by calling get_book_info on each incomplete
    # row. (The original comment named Kyobo Book Centre, but get_book_info in
    # this file queries yes24.com.)
    # Skip the step when nothing is incomplete: apply(..., result_type='expand')
    # on an empty selection hands back the full-width frame, and assigning
    # four column names to it would raise ValueError.
    if na_rows.any():
        updated_sample = ns_book5[na_rows].apply(get_book_info,
            axis=1, result_type='expand')
        updated_sample.columns = ['도서명', '저자', '출판사', '발행년도']
        ns_book5.update(updated_sample)

    # Drop rows whose title, author or publisher is NaN or whose year is -1.
    ns_book6 = ns_book5.dropna(subset=['도서명', '저자', '출판사'])
    ns_book6 = ns_book6[ns_book6['발행년도'] != -1]

    return ns_book6