# -*- coding: utf-8 -*-
"""Day 11_Mission.ipynb

Automatically generated by Colaboratory.

Original file is located at
"""

from google.colab import drive
drive.mount('/gdrive')

PATH = "/gdrive/My Drive/Colab Notebooks/resources/"

# %matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
import time
from scipy.stats import norm, multivariate_normal

# packages for interactive widgets
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

import urllib.parse
import urllib.request   # needed for urllib.request.urlopen below
from bs4 import BeautifulSoup

###############################################
############## Helper Functions ###############
###############################################

def my_df_dropNas(df, columns):
    # Drop rows that have NaN in any of the given columns
    for col in columns:
        df = df[df[col].notna()]
    return df

def my_checkNas(x):
    # Count the NaN values in each column
    y = x.apply(lambda col: sum(col.isna()))
    return y

@interact_manual(x=IntSlider(0, 0, 12))
def test_model(x):
    print(x)

###############################################
################# Code start ##################
###############################################

values = {'stnId': '184'}  # Jeju Island

API = "http://www.weather.go.kr/weather/forecast/mid-term-rss3.jsp"
params = urllib.parse.urlencode(values)
url = API + "?" + params

data = urllib.request.urlopen(url).read()
text = data.decode('utf-8')
soup = BeautifulSoup(text, 'html.parser')

"""# Problem 1"""

def my_df_xml(soup, keyword):
    # Collect the text of every tag matching `keyword` into a one-column DataFrame
    res = []
    for tag in soup.find_all(keyword):
        res.append(tag.string)
    return pd.DataFrame(res)

## Extract the contents of the title tags
my_df_xml(soup, "title")[:2]

## Extract the contents of the wf tags
my_df_xml(soup, "wf")[1:3]

## Extract the contents of the tmn tags
df_tmn = my_df_xml(soup, "tmn")
df_tmn[:2]

## Extract the contents of the tmx tags
df_tmx = my_df_xml(soup, "tmx")
df_tmx[:2]

res = pd.concat([df_tmn, df_tmx], axis=1)
res[:2]

"""# Problem 2"""

data = urllib.request.urlopen("https://ko.wikipedia.org/wiki/%ED%95%98%EB%8A%98%EA%B3%BC_%EB%B0%94%EB%9E%8C%EA%B3%BC_%EB%B3%84%EA%B3%BC_%EC%8B%9C").read()
text = data.decode('utf-8')
soup = BeautifulSoup(text, 'html.parser')

## List of poems by Yun Dong-ju
list(map(lambda x: x.string, soup.select("div.mw-parser-output ul li")))[:-1]

def soup_url(url):
    # Fetch a URL and return a BeautifulSoup object for it
    data = urllib.request.urlopen(url).read()
    text = data.decode(encoding="utf-8", errors="ignore")
    return BeautifulSoup(text, 'html.parser')

# KRW/USD exchange rate
soup = soup_url("https://finance.naver.com/marketindex/exchangeDetail.nhn?marketindexCd=FX_USDKRW")
soup.select_one("#container > div.aside > div:nth-of-type(2) > table > tbody > tr:nth-of-type(1) td").text

## Number of pages to crawl (just one page here)
pageNo = 1

## One soup object per StackOverflow question-list page
soups = [soup_url("https://stackoverflow.com/questions/tagged/python?tab=newest&page="
                  + str(i) + "&pagesize=15")
         for i in range(1, pageNo + 1)]

baseUrl = "https://stackoverflow.com"

## Collect the question links from every page
threadUrls = []
for soup in soups:
    threadUrls += [baseUrl + tag.attrs['href'].strip()
                   for tag in soup.select("div.summary > h3 > a")]

## Number of question links collected (15 per page, per the pagesize parameter)
len(threadUrls)

## ..... the state of this code is ......
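# --- Added sketch (not in the original notebook) ---
# Fetching every thread page back-to-back can trip rate limiting on the server side,
# so a small pause between requests is safer. soup_url_polite is a hypothetical helper
# name; it only wraps the soup_url defined above and the time module imported at the top.
def soup_url_polite(url, delay=1.0):
    time.sleep(delay)   # wait before each request to avoid hammering the site
    return soup_url(url)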
soups_InPage = [soup_url(thread_url) for thread_url in threadUrls]

## Each line of a post sits in its own <p> tag, so the text is joined back together with newline ("\n") separators.
questions = ["\n".join(list(map(lambda x: x.text.strip(),
                                soup.select("div.postcell div.post-text p"))))
             for soup in soups_InPage]

answers = ["\n".join(list(map(lambda x: x.text.strip(),
                              soup.select("div.answercell div.post-text p"))))
           for soup in soups_InPage]

len(questions), len(answers)
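# --- Added sketch (not in the original notebook) ---
# Pairing the crawled questions and answers into a DataFrame makes them easier to inspect
# or save for later use. The column names "question"/"answer" and the output file name are
# assumptions, not something the original code defines.
df_qa = pd.DataFrame({"question": questions, "answer": answers})
df_qa.head()
# df_qa.to_csv(PATH + "stackoverflow_python_qa.csv", index=False)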