This commit is contained in:
dosang.yoon
2022-06-03 15:29:49 +09:00
parent 7839146545
commit 7ded335f71
6 changed files with 148 additions and 25 deletions

View File

@@ -2,6 +2,7 @@ import json
import datetime
import requests
import pandas as pd
from time import sleep
import os
class MetaCrawler:
@@ -46,8 +47,17 @@ class MetaCrawler:
finish = False
for i in range(1, self.limit_page_count):
#html = pd.read_html(input['URL'] + '&page=%s' % i, header=0)
html = pd.read_html(requests.get(input['URL'] + '&page=%s' % i, headers=self.header, timeout=30).text)
sleep(0.5)
html = None
while True:
try:
html = pd.read_html(requests.get(input['URL'] + '&page=%s' % i, headers=self.header, timeout=30).text)
sleep(0.5)
break
except:
print(i)
if i > 200:
break
continue
# Keep fetching pages up to the last one
if len(html[0].날짜.values) <= 1:
@@ -120,8 +130,17 @@ class MetaCrawler:
finish = False
for i in range(1, self.limit_page_count):
#html = pd.read_html(url + str(i), header=0)
html = pd.read_html(requests.get(url + str(i), headers=self.header, timeout=30).text)
sleep(0.5)
html = None
while True:
try:
html = pd.read_html(requests.get(url + str(i), headers=self.header, timeout=30).text)
sleep(0.5)
break
except:
print(i)
if i > 200:
break
continue
# Keep fetching pages up to the last one
if len(html[0].날짜.values) <= 2:
@@ -181,8 +200,17 @@ class MetaCrawler:
finish = False
for i in range(1, self.limit_page_count):
#html = pd.read_html(url + str(i), header=0, encoding='euc-kr')
html = pd.read_html(requests.get(url + str(i), headers=self.header, timeout=30).text, encoding='euc-kr')
sleep(0.5)
html = None
while True:
try:
html = pd.read_html(requests.get(url + str(i), headers=self.header, timeout=30).text, encoding='euc-kr')
sleep(0.5)
break
except:
print(url + str(i))
if i > 200:
break
continue
# Keep fetching pages up to the last one
if len(html[0].날짜.values) <= 10:
@@ -253,8 +281,17 @@ class MetaCrawler:
finish = False
for i in range(1, self.limit_page_count):
#html = pd.read_html(input['URL'] + '&page=%s' % i, header=0)
html = pd.read_html(requests.get(input['URL'] + '&page=%s' % i, headers=self.header, timeout=30).text)
sleep(0.5)
html = None
while True:
try:
html = pd.read_html(requests.get(input['URL'] + '&page=%s' % i, headers=self.header, timeout=30).text)
sleep(0.5)
break
except:
print(i)
if i > 200:
break
continue
# Keep fetching pages up to the last one
if len(html[0].날짜.values) <= 1:

View File

@@ -5,6 +5,7 @@ import re
import json
import os
import requests
from time import sleep
class Queue(object):
def __init__(self, max):
@@ -255,8 +256,17 @@ class StockCrawler:
# Exclude the repeated last page for recently listed companies.
pg_url = '{url}&page={page}'.format(url=url, page=page)
#html = pd.read_html(pg_url, header=0)
html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text)
sleep(0.5)
html = None
while True:
try:
html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text)
sleep(0.5)
break
except:
print(pg_url)
if page > 200:
break
continue
count = 0
for date in html[0].날짜.values:
@@ -379,8 +389,17 @@ class StockCrawler:
# Exclude the repeated last page for recently listed companies.
pg_url = '{url}&page={page}'.format(url=url, page=page)
#html = pd.read_html(pg_url, header=0)
html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text)
sleep(0.5)
html = None
while True:
try:
html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text)
sleep(0.5)
break
except:
print(pg_url)
if page > 200:
break
continue
count = 0
for date in html[0].날짜.values:
@@ -603,8 +622,17 @@ class StockCrawler:
# Exclude the repeated last page for recently listed companies.
pg_url = '{url}&page={page}'.format(url=url, page=page)
#html = pd.read_html(pg_url, header=0)
html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text)
sleep(0.5)
html = None
while True:
try:
html = pd.read_html(requests.get(pg_url, headers=self.header, timeout=30).text)
sleep(0.5)
break
except:
print(pg_url)
if page > 200:
break
continue
count = 0
for date in html[0].날짜.values: