# -*- coding: utf-8 -*-
"""Day 11_Mission.ipynb

Automatically generated by Colaboratory.

Original file is located at
"""

from google.colab import drive
drive.mount('/gdrive')

PATH = "/gdrive/My Drive/Colab Notebooks/resources/"

# %matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
import time
from scipy.stats import norm, multivariate_normal

# packages for interactive widgets
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

import urllib.parse
import urllib.request   # needed for urllib.request.urlopen below
from bs4 import BeautifulSoup

###############################################
############## Helper Functions ###############
###############################################

def my_df_dropNas(df, columns):
    # Drop rows that have NaN in any of the given columns
    for col in columns:
        df = df[df[col].notna()]
    return df

def my_checkNas(x):
    # Count the NaN values in each column
    y = x.apply(lambda col: sum(col.isna()))
    return y

@interact_manual(x=IntSlider(0, 0, 12))
def test_model(x):
    print(x)

###############################################
################# Code start ##################
###############################################

values = {'stnId': '184'}  # Jeju Island

API = "http://www.weather.go.kr/weather/forecast/mid-term-rss3.jsp"
params = urllib.parse.urlencode(values)
url = API + "?" + params

data = urllib.request.urlopen(url).read()
text = data.decode('utf-8')
soup = BeautifulSoup(text, 'html.parser')

"""# Problem 1"""

def my_df_xml(soup, keyword):
    # Collect the text of every tag matching `keyword` into a one-column DataFrame
    res = []
    for tag in soup.find_all(keyword):
        res.append(tag.string)
    return pd.DataFrame(res)

## Extract the contents of the title tags
my_df_xml(soup, "title")[:2]

## Extract the contents of the wf tags
my_df_xml(soup, "wf")[1:3]

## Extract the contents of the tmn tags
df_tmn = my_df_xml(soup, "tmn")
df_tmn[:2]

## Extract the contents of the tmx tags
df_tmx = my_df_xml(soup, "tmx")
df_tmx[:2]

res = pd.concat([df_tmn, df_tmx], axis=1)
res[:2]

"""# Problem 2"""

data = urllib.request.urlopen("https://ko.wikipedia.org/wiki/%ED%95%98%EB%8A%98%EA%B3%BC_%EB%B0%94%EB%9E%8C%EA%B3%BC_%EB%B3%84%EA%B3%BC_%EC%8B%9C").read()
text = data.decode('utf-8')
soup = BeautifulSoup(text, 'html.parser')

## List of poems by Yun Dong-ju
list(map(lambda x: x.string, soup.select("div.mw-parser-output ul li")))[:-1]

def soup_url(url):
    # Fetch a URL and return a BeautifulSoup object for it
    data = urllib.request.urlopen(url).read()
    text = data.decode(encoding="utf-8", errors="ignore")
    return BeautifulSoup(text, 'html.parser')

# KRW/USD exchange rate
soup = soup_url("https://finance.naver.com/marketindex/exchangeDetail.nhn?marketindexCd=FX_USDKRW")
soup.select_one("#container > div.aside > div:nth-of-type(2) > table > tbody > tr:nth-of-type(1) td").text

## Number of pages to crawl (just one page here)
pageNo = 1

## One soup object per StackOverflow question-list page
soups = [soup_url("https://stackoverflow.com/questions/tagged/python?tab=newest&page="
                  + str(i) + "&pagesize=15")
         for i in range(1, pageNo + 1)]

baseUrl = "https://stackoverflow.com"

## Collect the question links from every page
threadUrls = []
for soup in soups:
    threadUrls += [baseUrl + tag.attrs['href'].strip()
                   for tag in soup.select("div.summary > h3 > a")]

## Number of question links collected (15 per page, per the pagesize parameter)
len(threadUrls)

## ..... the state of this code is ......
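# --- Added sketch (not in the original notebook) ---
# Fetching every thread page back-to-back can trip rate limiting on the server side,
# so a small pause between requests is safer. soup_url_polite is a hypothetical helper
# name; it only wraps the soup_url defined above and the time module imported at the top.
def soup_url_polite(url, delay=1.0):
    time.sleep(delay)   # wait before each request to avoid hammering the site
    return soup_url(url)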
soups_InPage = [soup_url(thread_url) for thread_url in threadUrls]

## Each line of a post sits in its own <p> tag, so the text is joined back together with newline ("\n") separators.
questions = ["\n".join(list(map(lambda x: x.text.strip(),
                                soup.select("div.postcell div.post-text p"))))
             for soup in soups_InPage]

answers = ["\n".join(list(map(lambda x: x.text.strip(),
                              soup.select("div.answercell div.post-text p"))))
           for soup in soups_InPage]

len(questions), len(answers)
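# --- Added sketch (not in the original notebook) ---
# Pairing the crawled questions and answers into a DataFrame makes them easier to inspect
# or save for later use. The column names "question"/"answer" and the output file name are
# assumptions, not something the original code defines.
df_qa = pd.DataFrame({"question": questions, "answer": answers})
df_qa.head()
# df_qa.to_csv(PATH + "stackoverflow_python_qa.csv", index=False)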