搞笑视频爬虫
import json
import hashlib
import time
import requests
import re
import threading
from pymongo import MongoClient
from qiniu import Auth, put_data
import multiprocessing
import sys
sys.path.insert(0, '/data/MyBlog')
from MyBlog.settings import MEDIA_URL
from MyBlog.utils import redis_client
from constants import MONGO_URI, MONGODB_NAME, QINIU_ACCESS_KEY, QINIU_SECRET_KEY, QINIU_BUCKET_NAME
class QiNiu(object):
    """Small helper that uploads in-memory data to a single Qiniu bucket.

    Credentials and the bucket name come from the ``constants`` module
    (QINIU_ACCESS_KEY, QINIU_SECRET_KEY, QINIU_BUCKET_NAME).
    """

    def __init__(self):
        # Target bucket for every upload made through this instance.
        self.bucket_name = QINIU_BUCKET_NAME
        # SDK auth object used to sign upload tokens.
        self.user = Auth(QINIU_ACCESS_KEY, QINIU_SECRET_KEY)

    def up_stream(self, stream, key):
        """Upload *stream* to the bucket under the object name *key*.

        Args:
            stream: The data to upload (the caller in this file passes the
                raw bytes of a downloaded video).
            key: Object name to store the data under.

        Returns:
            Whatever ``qiniu.put_data`` returns; the caller in this file
            unpacks it as a two-item ``(ret, info)`` pair.
        """
        # A fresh token is requested per upload; 3600 is passed as the
        # third argument of upload_token (the expiry, in seconds, per the
        # SDK signature -- TODO confirm against the installed version).
        upload_token = self.user.upload_token(self.bucket_name, key, 3600)
        # NOTE(review): progress_handler=True is forwarded unchanged; the
        # parameter name suggests the SDK expects a callback -- confirm.
        outcome = put_data(upload_token, key, stream, progress_handler=True)
        return outcome
def get_mongo():
    """Return a handle to the ``MONGODB_NAME`` database on ``MONGO_URI``.

    The client is created with ``connect=False``, exactly as before.
    """
    client = MongoClient(MONGO_URI, connect=False)
    database = client[MONGODB_NAME]
    return database
# Module-level handles, created once when this module is loaded:
# the MongoDB database handle and the Qiniu upload helper.
mongodb = get_mongo()
qiniu_client = QiNiu()
class VideoSpider(DuanziSpider):
    """Crawl video listing pages, mirror each MP4 to Qiniu and index it in MongoDB.

    This class reads ``self.headers`` and ``self.mongodb`` but never sets
    them; presumably ``DuanziSpider.__init__`` provides both -- confirm
    against the base class.

    NOTE(review): ``DuanziSpider`` is not imported in the part of the file
    that was reviewed; verify that it is brought into scope somewhere.
    """

    # Raw strings keep the patterns byte-identical to the originals while
    # avoiding invalid-escape warnings for \S and \s.
    _PAGE_LINK_RE = re.compile(r'<a class="linkPage" href="(.*?)">([\S\s]*?)</a>')
    _ANCHOR_RE = re.compile(r'<a href="(.*)">([\S\s]*?)</a>')

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and set the listing / metadata URLs."""
        DuanziSpider.__init__(self, *args, **kwargs)
        # Alternative listing URLs left by the original author:
        #   http://gaoxiao.52op.net/egao/index.htm
        #   http://gaoxiao.52op.net/fangyan/
        self.base_url = 'http://gaoxiao.52op.net/egao/'
        # Template for the per-video JSON metadata endpoint; {} is the video id.
        self.json_url = 'http://gaoxiao.52op.net/flvData/d.aspx?id={}'

    def get_media_list(self):
        """Collect the JSON metadata URLs of all videos on the listing pages.

        Fetches ``self.base_url``, follows every ``linkPage`` pagination link
        found on it, and extracts a video id from each anchor whose captured
        attribute text ends with ``target="_blank`` and mentions ``htm``.

        Returns:
            set: ``self.json_url`` formatted with each discovered video id.
        """
        res = requests.get(self.base_url, headers=self.headers)
        res.encoding = 'utf-8'
        # Keep the page list in a local variable. Previously self.base_url was
        # overwritten with a list here, so a second call passed a list to
        # requests.get and failed.
        page_urls = [self.base_url]
        page_urls.extend(href for href, _ in self._PAGE_LINK_RE.findall(res.text))
        # NOTE(review): pagination hrefs are requested exactly as captured; if
        # the site emits relative links they would need resolving against
        # base_url first -- confirm against the live markup.

        all_url = set()
        for page_url in page_urls:
            ret = requests.get(url=page_url, headers=self.headers)
            ret.encoding = 'utf-8'
            for href, _ in self._ANCHOR_RE.findall(ret.text):
                # The greedy (.*) capture runs up to the last '">' on the
                # line, so for new-tab links it looks like
                # '<path>/<id>.htm" target="_blank'.
                if href.endswith('target="_blank') and 'htm' in href:
                    # First space-separated token -> last path segment ->
                    # text before the first dot is the video id.
                    _id = href.split(' ')[0].split('/')[-1].split('.')[0]
                    all_url.add(self.json_url.format(_id))
        return all_url

    def down_and_upload_media(self, url):
        """Download the video described at *url*, upload it, and record it.

        The method returns ``None`` without writing anything when: the
        metadata or video request does not answer 200/201, the metadata body
        is not valid JSON, it has no ``Data``/``MP4`` entry, a document with
        the same content hash already exists, or the upload does not yield a
        key.

        Args:
            url: JSON metadata URL as produced by ``get_media_list``.
        """
        print(url)
        res = requests.get(url=url, headers=self.headers)
        res.encoding = 'utf-8'
        if res.status_code not in (200, 201):
            return
        try:
            data = json.loads(res.text)
        except ValueError:
            # A non-JSON body is treated like any other unusable response
            # instead of aborting the whole crawl.
            return
        payload = data.get('Data')
        if not payload:
            return
        mp4_url = payload.get('MP4')
        if not mp4_url:
            return
        # Encode/decode round trip kept from the original (a no-op on Python 3).
        mp4_url = mp4_url.encode('utf-8').decode()
        # Headers must be passed by keyword: the second positional parameter
        # of requests.get is `params`, so the old call sent the headers as a
        # query string instead of as HTTP headers.
        res = requests.get(mp4_url, headers=self.headers)
        if res.status_code not in (200, 201):
            return

        # The MD5 of the file content doubles as dedup key and object name.
        uid = hashlib.md5(res.content).hexdigest()
        existing = self.mongodb['video'].find_one({'uid': uid})
        if existing:
            print(existing)
            return

        # 'Name' is read from the top-level JSON object (not from 'Data').
        # A missing name falls back to '' rather than raising TypeError
        # after the upload has already been paid for.
        name = data.get('Name') or ''
        if ' ' in name:
            parts = name.split(' ')
            title, desc = parts[0], parts[1]
        else:
            title = desc = name

        r, info = qiniu_client.up_stream(res.content, 'video/' + uid + '.mp4')
        # The upload result may be None on failure; guard before using .get().
        if not r or not r.get('key'):
            print('upload failed for {}: {}'.format(url, info))
            return

        to_write = {
            'id': self.get_inc_id(),
            'uid': uid,
            'src': MEDIA_URL + r.get('key'),
            'title': title,
            'describe': desc,
        }
        print('write data {}'.format(to_write))
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        self.mongodb['video'].insert_one(to_write)

    @staticmethod
    def get_inc_id():
        """Return the next video id by incrementing the Redis key ``video_id`` by 1."""
        return redis_client.incrby('video_id', 1)

    def run(self):
        """Discover all video metadata URLs and process them one by one."""
        urls = list(self.get_media_list())
        print(len(urls))
        # Processing is sequential. The original also created a
        # multiprocessing.Pool here that was never used, closed or joined;
        # it has been dropped to avoid leaking worker processes.
        for media_url in urls:
            self.down_and_upload_media(media_url)
# Script entry: build the spider and start crawling immediately.
# There is no __main__ guard, so this also runs whenever the module is imported.
video_spider = VideoSpider()
video_spider.run()
瞎玩玩,简单爬
links: https://www.mongona.com/?c=6