python 4주차 복습-3

IT&코딩/python

python 4주차 복습-3

솔론 2022. 7. 31. 04:39

728x90

cmd에 pip install requests 치고

pip list 로 확인

네트워크란? 데이터를 주고받는 것

client : 데이터를 요청하는 주체

server : 데이터를 제공하는 주체

request : 데이터를 요청하는 행위

response : 데이터를 제공하는 행위

웹 브라우저 : 인터넷하려고 더블클릭하는 것!

requests.get(url)

url 로 requests 를 보냄, response 를 반환합니다.

# The snippet uses requests, so it has to be imported before the first call.
import requests

# Send a GET request to the URL; requests.get returns the response object.
res = requests.get("https://www.naver.com")

print(res)

# Printing the type shows that res is an instance of the Response class.
print(type(res))  # <class 'requests.models.Response'>

print(res.text)  # the HTML source of the page

# Web crawling: pulling data from the web,
# i.e. scraping the HTML source of a page.

1.requests lib

------------------------

import requests

res=requests.get(url)

------------------------

url 로 request 보내주고, response 를 받음

res 는 response class 의 인스턴스

- res.text (HTML소스)

-------------------------------------------------------------------------------------------------------------

HTML (웹 페이지의 구조를 기술하는 마크업 언어)

1) 태그

꺾쇠(< >)로 둘러싸인 부분. 꺾쇠가 시작하고 공백 전까지가 태그 이름이라고 보면 된다.

여는 태그 <>. 닫는태그 </>

2) 속성 : 등호를 기준으로 왼쪽이 속성

3) 속성값 : 태그를 부연설명 해준다 : 공백 이후 등호 기준 오른쪽이 속성값

4) 텍스트 : 꺾쇠 안에 없는 부분. 외부(화면)로 노출된다.

하지만 파이썬에서는 HTML 코드를 구분할 수 없다.

파이썬은 웹의 언어를 이해하지 못하기 때문에. res.text는 다 문자열로 인식된다

파싱 : 규격에 맞춰서 해석함

파싱을 위해 pip install beautifulsoup4 로 설치 후, pip list로 설치확인.

T가 주신 코드 여러줄 문자열 처리

from bs4 import BeautifulSoup

# Sample HTML supplied by the instructor, kept as a multi-line string.
# NOTE(review): this literal appears to have lost most of its markup (only
# closing tags and one text line remain, and none of the ids/classes queried
# below occur in it), so the selectors below presumably return None against
# it as written -- restore the original HTML before running.
st="""

</div>

NAVER로가기

</body>

</html>

"""

# Parse the raw string as HTML so it can be queried with CSS selectors.
soup=BeautifulSoup(st,"html.parser")

# print(soup.select_one("div"))

# To select by id, put '#' in front of the id value.

print(soup.select_one("#hello1"))

print(soup.select_one(".hello2")) # the tag whose class is hello2

print(soup.select_one(".d"))

print(soup.select_one("#c>.d")) # selectors can be chained: a class-d tag directly under the tag with id c

# The starting point does not matter, but the chain must not skip a level.

print(soup.select_one("html>div>span")) # body is left out, so the chain does not connect

# BeautifulSoup(plain string, parser)
# creates an "interpreter" that understands the plain string in the parser's format

BeautifulSoup(아무 의미없는 문자열, 파서)

아무의미없는 문자열을 파서의 형태로 이해하고 있는 통역가 생성

2. 아무 의미없다 in python > 파싱

==================================================

from bs4 import BeautifulSoup

soup=BeautifulSoup(문자열, "html.parser")

==================================================

문자열을 html 형식으로 이해하고 있는 soup 을 생성 # 통역가를 생성.

3. soup.select (태그들)

soup.select_one (태그) 처음에 걸리는 하나만 가져온다.

태그를 지칭하는 방법(셀렉터)

1) 태그 그대로

2) id #

3) class .

4) 하위태그 > # 하위태그는 어떤 태그가 끝나기 전에 들어가 있는 태그

-------------------------------------------------------------------------------------------------------------

# 참교육 90화 페이지가서

# 제목, 평점, 참여자수, 등록일

import requests
from bs4 import BeautifulSoup

# Fetch the episode-90 page and parse it once.
res = requests.get("https://comic.naver.com/webtoon/detail?titleId=758037&no=90&weekday=mon")
# print(res.text)
soup = BeautifulSoup(res.text, "html.parser")

# One selector per field, printed in the same order as the task statement.
for selector in (
    ".view>h3",  # title
    "#topPointTotalNumber>strong",  # rating
    ".pointTotalPerson>em",  # number of raters
    ".date",  # upload date
):
    print(soup.select_one(selector))

-------------------------------------------------------------------------------------------------------------

import requests
from bs4 import BeautifulSoup

page = requests.get("https://comic.naver.com/webtoon/list?titleId=758037&weekday=mon")
soup = BeautifulSoup(page.text, "html.parser")

# Fields to extract: title, author name, description, genre, age rating.

# select() returns every match, so this prints the list of all h2 tags.
print(soup.select("h2"))

# select_one() returns only the first match for each selector.
for selector in (
    "h2>.title",  # title
    ".wrt_nm",  # author name
    ".detail>p",  # description
    ".genre",  # genre
    ".age",  # age rating
):
    print(soup.select_one(selector))

4. soup.select > 태그'들' > tag class 의 인스턴스들의 리스트

soup.select_one > 태그 > tag class 의 인스턴스

Tag Class 의 메서드/필드

text : 텍스트부분 추출

.text 하면 되지만 select는 리스트라서..

from bs4 import BeautifulSoup

import requests

# Sample HTML used to demonstrate the .text field of a tag.
# NOTE(review): this literal appears to have lost most of its markup (only
# closing tags and one text line remain), so select_one presumably returns
# None for the selectors below and .text would then fail -- restore the
# original HTML before running.
st="""

</div>

NAVER로가기

</body>

</html>

"""

# Parse the string as HTML.
soup=BeautifulSoup(st,"html.parser")

# .text extracts the text part of the single tag returned by select_one.
print(soup.select_one("#hello1").text)

print(soup.select_one(".hello2").text)

print(soup.select_one(".d").text)

print(soup.select_one("body>span").text) # span directly under body

-------------------------------------------------------------------------------------------------------------

# 데이터 전송 방식

1) GET : url 로 데이터 전송

detail?titleId=758037&no=20&weekday=mon

? : 데이터 전송의 시작

& : 데이터를 연결

# 미션

# 참교육 1화~90화까지

# 평점, 참여자수, 등록일 뽑기!

import csv

import requests
from bs4 import BeautifulSoup

# Collect title, rating, number of raters and upload date for episodes 1-90
# and store them in a CSV file.
#
# The with-block guarantees the file is flushed and closed even if a request
# or a selector fails part-way through. newline="" is what the csv module
# expects for files it writes, and the explicit encoding keeps the Korean text
# independent of the platform default.
with open("참교육.csv", "w", encoding="utf-8", newline="") as f:
    # csv.writer quotes fields that contain commas, so scraped text with a
    # comma in it cannot shift the columns.
    writer = csv.writer(f)
    writer.writerow(["화수", "평점", "참여자수", "게시된 날짜"])

    for i in range(1, 91):
        # The episode number travels in the "no" query parameter (GET).
        res = requests.get(f"https://comic.naver.com/webtoon/detail?titleId=758037&no={i}&weekday=mon")
        soup = BeautifulSoup(res.text, "html.parser")

        sub = soup.select_one(".view > h3").text
        star = soup.select_one("#topPointTotalNumber > strong").text
        peo = soup.select_one(".pointTotalPerson > em").text
        d = soup.select_one(".date").text

        writer.writerow([sub, star, peo, d])

# 과제

# champion code 는 1에서 161 까지 있습니다.

# - 161개의 캐릭터 한국이름, 영어이름, 캐릭터 설명, 특징 각각 따와주세요

import os

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# open() does not create missing directories, so make sure the output folder
# exists before the first file is written.
os.makedirs("롤", exist_ok=True)

# Champion codes run from 1 to 161; tqdm shows a progress bar over the range.
for i in tqdm(range(1, 162)):
    res = requests.get(f"https://lol.inven.co.kr/dataninfo/champion/detail.php?code={i}")
    soup = BeautifulSoup(res.text, "html.parser")

    kor = soup.select_one(".korName").text
    engn = soup.select_one(".engName").text
    des = soup.select_one(".descText").text
    png = soup.select_one(".png_bg").text

    print(kor)
    print(engn)
    print(des)
    print(png)

    # Only the part of the Korean name before the first comma is used as the
    # file name.
    filename = kor.split(",")[0]

    # The with-block closes each file as soon as it is written instead of
    # leaving 161 handles open until the interpreter exits.
    with open(f"롤/{filename}.txt", "w", encoding="utf-8") as f:
        f.write(f"{kor}\n{engn}\n{des}\n{png}\n")

# 2) POST

# HTML (뼈대)

# CSS (꾸며줌)

# Javascript (동적처리)

https://news.ycombinator.com/ 뉴스기사 600개 가져오기

힌트 : 사이트는 정형화 되어있어요. 하나만 가져오려다 여러개 가져와진다. 제목에 해당되는 셀렉터 있을 거예요 그걸로 모든 태그를 탐색한 뒤

텍스트 부분 출력해 주시면 되고, 600개는 하단의 more를 눌러보시면 같이 잡힐 부분.

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# The assignment asks for 600 titles; the listing is paginated through the
# "p" query parameter, and pages 1-20 are fetched here (presumably 30 titles
# per page -- confirm against the site).
for k in tqdm(range(1, 21)):
    res = requests.get(f"https://news.ycombinator.com/news?p={k}")
    soup = BeautifulSoup(res.text, "html.parser")

    # select() returns every tag matching the title selector on the page.
    for i in soup.select(".titlelink"):
        print(i.text)

# https://news.ycombinator.com/ 뉴스기사 600개 가져오기

# 힌트 : 사이트는 정형화 되어있어요. 하나만 가져오려다 여러개 가져와진다. 제목에 해당되는 셀렉터 있을 거예요 그걸로 모든 태그를 탐색한 뒤

# 텍스트 부분 출력해 주시면 되고, 600개는 하단의 more를 눌러보시면 같이 잡힐 부분.

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# The assignment asks for 600 titles; the listing is paginated through the
# "p" query parameter, and pages 1-20 are fetched here (presumably 30 titles
# per page -- confirm against the site).
for k in tqdm(range(1, 21)):
    res = requests.get(f"https://news.ycombinator.com/news?p={k}")
    soup = BeautifulSoup(res.text, "html.parser")

    # select() returns every tag matching the title selector on the page.
    for i in soup.select(".titlelink"):
        print(i.text)

# 사진 다운받기

import requests

# The download address is the value of the image tag's src attribute.
# Maps the local file name to the image URL; dicts keep insertion order, so
# the files are fetched in the order listed.
downloads = {
    "kimchi.gif": "https://search.pstatic.net/common/?src=http%3A%2F%2Fblogfiles.naver.net%2F20151228_180%2Fminjiyeon236_14512674681733isvD_GIF%2Fanigif.gif&type=sc960_832_gif",
    "chicken.gif": "https://search.pstatic.net/sunny/?src=http%3A%2F%2Ffile3.instiz.net%2Fdata%2Ffile3%2F2018%2F11%2F01%2F8%2F3%2Fd%2F83dcca52f6a1b1c067e6b62ff9d6d9a8.gif&type=sc960_832_gif",
    "chovy.gif": "http://file3.instiz.net/data/file3/2020/07/29/4/2/d/42dd9faae2a90f1f0773ac157c94e59f.gif",
    "kuromi.gif": "https://mblogthumb-phinf.pstatic.net/MjAyMDAxMTZfNzAg/MDAxNTc5MTg1MzU5MjIz.1SXpiqJFwHDoh2Vk0xWJCvTf4tLcpAk6jllW9pF9Ek4g.sH5EmdfaKEYzG-M7D7-JmY67BgpZ1V-8VtP8mT1sUoUg.GIF.pinkpeach09/IMG_5820.GIF?type=w800",
}

for filename, url in downloads.items():
    r = requests.get(url)
    # Images are binary files, so the mode must be "wb" rather than "w".
    # The with-block closes each file right after its bytes are written.
    with open(filename, "wb") as f:
        f.write(r.content)

-------------------------------------------------------------------------------------------------------------

파일

1. 텍스트 파일

2. 바이너리 파일

파일은 1010101... 같은 비트들로 이루어져 있는데, 이것을 어떻게 해석할 것인가에 따라 텍스트 파일과 바이너리 파일로 나뉜다.

Tag 클래스의 메서드와 필드

1) .text : 텍스트 추출

2) .get(속성명) : 속성값 추출

3) .select : 마저 자르기

# Sample HTML used to demonstrate Tag.get(attribute_name).
# NOTE(review): this literal appears to have lost most of its markup (only
# closing tags and one text line remain), so select_one presumably returns
# None for the selectors below and .get would then fail -- restore the
# original HTML before running.
st="""

</div>

NAVER로가기

</body>

</html>

"""

import requests

from bs4 import BeautifulSoup

# Parse the string as HTML.
soup = BeautifulSoup(st, "html.parser")

# .get(name) reads the value of the attribute called name from the tag.
print(soup.select_one("#hello1").get("here"))

print(soup.select_one(".hello2").get("here"))

print(soup.select_one(".d").get("here"))

print(soup.select_one("a").get("href")) # href attribute of the first a tag

-------------------------------------------------------------------------------------------------------------

# 월요웹툰 썸네일 전체를 "웹툰"이라는 폴더 안에 저장되도록 이미지 크롤링 해보세요.

# - 파일 이름은 웹툰이름으로 할 거예요!

# 신의탑.png 처럼요

# hint0 : 파일이름이랑 파일 경로 둘다 필요한 상황이다!

# hint1 : 어 참교육 하나만 따오려고 했는데 다 따와지네?

# hint2 : title, src 속성값 두 개를 따오자.

import requests

from bs4 import BeautifulSoup

왜 '제왕'이랑 '메디컬급식'은 저장이 안 될까? 이유 : 제목에 콜론(:)이 들어 있는데, 콜론은 (윈도우에서) 파일 이름에 쓸 수 없는 문자라서. 그래서 아래 함수로 그런 문자를 걸러낸다.

def 걸러(st):
    """Return *st* with characters that are unsafe in file names removed.

    Scraped titles are used as file names, and a title containing a character
    such as a colon cannot be saved under that name. The characters removed
    are: slash, backslash, double quote, angle brackets, question mark,
    exclamation mark, asterisk, colon and the vertical bar. The vertical bar
    is included because Windows rejects it in file names as well.

    Args:
        st: The raw title text.

    Returns:
        A copy of *st* with every listed character deleted; all other
        characters are kept in their original order.
    """
    # One translate() pass deletes all listed characters at once instead of
    # running a separate replace() per character.
    return st.translate(str.maketrans("", "", '/\\"<>?!*:|'))

import os

# Download the thumbnail of every Monday webtoon into 웹툰/<title>.png.
res = requests.get("https://comic.naver.com/webtoon/weekdayList?week=mon")
soup = BeautifulSoup(res.text, "html.parser")

# open() does not create missing directories, so make sure the target folder
# exists before the first thumbnail is written.
os.makedirs("웹툰", exist_ok=True)

# The selector matches every thumbnail image on the page, not just one.
for img in soup.select(".thumb>a>img"):
    src = img.get("src")  # image URL
    title = img.get("title")  # webtoon name, used as the file name

    r = requests.get(src)

    # The title is filtered first because some titles contain characters
    # (e.g. a colon) that are not allowed in file names. The with-block
    # closes each file right after it is written.
    with open(f"웹툰/{걸러(title)}.png", "wb") as f:
        f.write(r.content)

import os

import requests

from bs4 import BeautifulSoup

def 폴더생성(st):
    """Create the directory *st* unless it already exists.

    os.mkdir raises an error when the directory is already there, which is
    why the original checked os.path.isdir first. os.makedirs with
    exist_ok=True does the same job in one call, without a gap between the
    check and the creation, and also creates missing parent directories.

    Args:
        st: Path of the directory to create.

    Raises:
        FileExistsError: If *st* exists but is not a directory.
    """
    os.makedirs(st, exist_ok=True)

def 걸러(st):
    """Return *st* with characters that are unsafe in file names removed.

    Scraped titles are used as file names, and a title containing a character
    such as a colon cannot be saved under that name. The characters removed
    are: slash, backslash, double quote, angle brackets, question mark,
    exclamation mark, asterisk, colon and the vertical bar. The vertical bar
    is included because Windows rejects it in file names as well.

    Args:
        st: The raw title text.

    Returns:
        A copy of *st* with every listed character deleted; all other
        characters are kept in their original order.
    """
    # One translate() pass deletes all listed characters at once instead of
    # running a separate replace() per character.
    return st.translate(str.maketrans("", "", '/\\"<>?!*:|'))

# Weekday codes accepted by the "week" query parameter of the listing page.
li = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]

# Root output folder; one sub-folder per weekday is created inside the loop.
폴더생성("웹툰")

for day in li:
    폴더생성(f"웹툰/{day}")

    res = requests.get(f"https://comic.naver.com/webtoon/weekdayList?week={day}")
    soup = BeautifulSoup(res.text, "html.parser")

    # The selector matches every thumbnail image on that weekday's page.
    for img in soup.select(".thumb>a>img"):
        src = img.get("src")  # image URL
        title = img.get("title")  # webtoon name, used as the file name

        r = requests.get(src)

        # Filter the title so it is a valid file name; the with-block closes
        # each file right after it is written.
        with open(f"웹툰/{day}/{걸러(title)}.png", "wb") as f:
            f.write(r.content)

-------------------------------------------------------------------------------------------------------------

# 여러 개의 태그를 가져올 때? (.select 참조)

from bs4 import BeautifulSoup

import os

import requests

# Sample markup for practising select() on several tags that share a class.
# NOTE(review): each span is followed by a closing </div> with no matching
# opening tag, so the wrapper elements (and any other child tags) appear to
# have been lost from this literal -- confirm against the original material.
st = """<html>

<body>

<span class="name">KENNEN</span>

</div>

<span class="name">TEEMO</span>

</div>

<span class="name">VEIGAR</span>

</div>

</body>

</html>"""

# 롤 챔피언 이미지랑 이름 따오는 거

import os

import requests

from bs4 import BeautifulSoup

def 폴더생성(st):
    """Create the directory *st* unless it already exists.

    os.mkdir raises an error when the directory is already there, which is
    why the original checked os.path.isdir first. os.makedirs with
    exist_ok=True does the same job in one call, without a gap between the
    check and the creation, and also creates missing parent directories.

    Args:
        st: Path of the directory to create.

    Raises:
        FileExistsError: If *st* exists but is not a directory.
    """
    os.makedirs(st, exist_ok=True)

def 걸러(st):
    """Return *st* with characters that are unsafe in file names removed.

    Scraped titles are used as file names, and a title containing a character
    such as a colon cannot be saved under that name. The characters removed
    are: slash, backslash, double quote, angle brackets, question mark,
    exclamation mark, asterisk, colon and the vertical bar. The vertical bar
    is included because Windows rejects it in file names as well.

    Args:
        st: The raw title text.

    Returns:
        A copy of *st* with every listed character deleted; all other
        characters are kept in their original order.
    """
    # One translate() pass deletes all listed characters at once instead of
    # running a separate replace() per character.
    return st.translate(str.maketrans("", "", '/\\"<>?!*:|'))

# Parse the sample markup first; the commented-out loop below shows how
# select_one can be called on each tag returned by select().
soup = BeautifulSoup(st, "html.parser")

# for i in soup.select("#champ"):
#     print(i.select_one(".name"))
#     print(i.select_one(".hp"))
#     print(i.select_one(".mp"))

# Real task: download every champion icon from the champion list page.
res = requests.get("https://lol.inven.co.kr/dataninfo/champion")
soup = BeautifulSoup(res.text, "html.parser")

폴더생성("롤 아이콘")

for card in soup.select(".champImage"):
    # The champion name comes from the link's title attribute.
    name = card.select_one("a").get("title")

    # The scheme has to be prepended to the src value to get a usable URL
    # (presumably because the page uses scheme-less "//host/..." addresses).
    url = "https:" + card.select_one("img").get("src")

    r = requests.get(url)

    # Filter the name so it is a valid file name; the with-block closes each
    # file right after it is written.
    with open(f"롤 아이콘/{걸러(name)}.png", "wb") as f:
        f.write(r.content)

728x90

저작자표시 변경금지 (새창열림)