黑马程序员技术交流社区

标题: 【郑州校区】抓取自己的免费服务器 [打印本页]

作者: 我是楠楠 时间: 2017-8-18 13:45
标题: 【郑州校区】抓取自己的免费服务器

抓取自己的免费服务器

▇ 文/郑州中心就业服务部马老师

写这篇文章的初衷是要为自己提供免费的科(fan)学(qiang)上网服务器，方法简单粗暴，而且行之有效。本文尝试使用py语言抓取，会shell 的请自行略过，我只是不想讨论哪个是最好的语言。

写文章的时候免费服务器已经关闭，所以用了其他图片进行替换,敬请谅解。

原理：在 https://www.cellsystech.net/docs/ 抓取二维码图片地址，然后对二维码进行解析，这里参考了google zxing的在线二维码解析方案 http://zxing.org/w/decode.jspx 。解析后发现结果完全看不懂，原来解析出来的数据是经过base64编码的（base64只是一种编码方式，并不是加密），遂用base64解码，得到服务器地址/端口号/加密方式等信息。

本次环境为py 2.7，请求采用urllib2，采用3.x的可以优先考虑requests。

一、记录要抓取的地址

mdtz_doc_url = 'https://www.cellsystech.net/docs/'

zxing_url = 'http://zxing.org/w/decode?u='

二、请求要抓取的网页源码

source_code = urllib2.urlopen(self.mdtz_doc_url).read()

三、获取返回的源码后用BeautifulSoup进行转换，便于解析到对应的二维码地址

soup = BeautifulSoup(source_code)

for each_img_element in soup.findAll('img'):

#print 'got an img element', each_img_element.get('src')

self.imgUrl = each_img_element.get('src')

四、拿到二维码地址后进行解析 http://zxing.org/w/decode.jspx

zxing_ana_url = self.zxing_url +self.imgUrl

print zxing_ana_url

print '------------------------------zxing parsing---------------------------'

parsed_code=urllib2.urlopen(zxing_ana_url).read()

五、拿到解析结果后，第一步要去掉字符串前边不需要的 ss:// 前缀（共5个字符）

六、Base64解码

base64.decodestring(base64_code[5:])

得到解码结果：

chacha20:NIdewq-d3+qdH@usc.cellsystech.net:465

chacha20是加密方式

usc.cellsystech.net 是服务器地址

465 是端口号

NIdewq-d3+qdH 是密码

七、配置ss的方法就比较简单了，这里不再做过多介绍。

八、完整代码实例：

import base64

import urllib2

from bs4 import BeautifulSoup

import time

class MDTZ(object):
    """Scrape a Shadowsocks server config that is published as a QR code.

    One run of ``startSpider`` performs these steps:

    1. download the docs page and remember the address of its QR-code
       ``<img>`` element,
    2. pass that image address to the zxing online decoder,
    3. take the ``ss://<base64>`` text out of the decoder's result page,
    4. base64-decode it into ``method:password@server:port``, write that
       string to ``config.json`` and print its parts.
    """

    # Page on which the QR-code image is published.
    mdtz_doc_url = 'https://www.cellsystech.net/docs/'

    # zxing online decoder; the image address goes into the ``u`` parameter.
    zxing_url = 'http://zxing.org/w/decode?u='

    # Seconds to wait for either remote service before giving up.
    request_timeout = 30

    # Scheme prefix that precedes the base64 payload in the decoded QR text.
    _SS_PREFIX = 'ss://'

    def __init__(self):
        # Address of the QR-code image; filled in by start_ercode_spider().
        self.imgUrl = None
        # Text of every <pre> element of the most recent zxing result page.
        self.parse_code_attr = []

    def _fetch(self, url):
        """Return the body of *url*, always closing the HTTP response."""
        response = urllib2.urlopen(url, timeout=self.request_timeout)
        try:
            return response.read()
        finally:
            response.close()

    def _build_zxing_url(self, img_url):
        """Return the zxing decode URL for the image at *img_url*.

        A relative *img_url* is resolved against ``mdtz_doc_url``; the result
        is percent-encoded so that its own ``?``, ``&`` and ``#`` characters
        cannot be mistaken for part of the decoder's query string.
        """
        try:
            from urllib.parse import quote, urljoin  # Python 3
        except ImportError:
            from urllib import quote  # Python 2
            from urlparse import urljoin

        absolute = urljoin(self.mdtz_doc_url, img_url)
        if not isinstance(absolute, str):
            # Python 2 ``unicode``: quote() there only copes with byte strings.
            absolute = absolute.encode('utf-8')
        return self.zxing_url + quote(absolute, safe='')

    def start_ercode_spider(self):
        """Fetch the docs page and store its QR-code image address.

        ``self.imgUrl`` is first reset to ``None`` so a failed run never
        leaves the address of an earlier run behind. It then receives the
        ``src`` of the last ``<img>`` element that has one. Failures are
        printed rather than raised; the caller detects them through
        ``self.imgUrl`` still being ``None``.
        """
        print('-------------------starting ercode_spider-----------------\n')
        self.imgUrl = None
        try:
            source_code = self._fetch(self.mdtz_doc_url)
            soup = BeautifulSoup(source_code, 'html.parser')
            for each_img_element in soup.findAll('img'):
                src = each_img_element.get('src')
                if src:
                    self.imgUrl = src
        except Exception as exc:
            # Catch Exception, not everything, so Ctrl-C still stops the run.
            print('ercode_spider failed: %r' % (exc,))

    def startSpider(self):
        """Run the whole scrape/decode pipeline once.

        Returns:
            The decoded ``method:password@server:port`` string, or ``None``
            when any step fails (the failure is printed, not raised).

        Side effects: writes the decoded string to ``config.json`` and prints
        progress plus the individual connection settings.
        """
        try:
            self.start_ercode_spider()
            if not self.imgUrl:
                print('no QR code image found on %s' % self.mdtz_doc_url)
                return None

            zxing_ana_url = self._build_zxing_url(self.imgUrl)
            print(zxing_ana_url)
            print('------------------------------zxing parsing---------------------------')
            parsed_code = self._fetch(zxing_ana_url)
            print('parsed success \n')

            soup = BeautifulSoup(parsed_code, 'html.parser')
            # Rebuild the list on every run; appending to the old one would
            # make each later run decode the first run's stale result again.
            self.parse_code_attr = [pre.text for pre in soup.findAll('pre')]
            ss_codes = [text for text in self.parse_code_attr
                        if text.strip().startswith(self._SS_PREFIX)]
            if not ss_codes:
                print('no ss:// text found in the zxing result')
                return None

            base64_decode_code = self.parse_base64_code(ss_codes[0])
            self.write_json_config_file(base64_decode_code)
            self.parse_base64_decoding(base64_decode_code)
            return base64_decode_code
        except Exception as exc:
            # Best effort by design: report the problem and let the caller
            # decide whether to retry. KeyboardInterrupt is not caught here.
            print('spider failed: %r' % (exc,))
            return None

    def parse_base64_code(self, base64_code):
        """Decode the base64 payload of an ``ss://`` QR text.

        Args:
            base64_code: the decoded QR text, normally ``ss://<base64>``.
                Surrounding whitespace is ignored, and the prefix is removed
                only when it is actually present.

        Returns:
            The decoded payload as text.

        Raises:
            binascii.Error / TypeError: the payload is not valid base64
                (which of the two depends on the Python version).
        """
        print('--------------------------------base64 decoding-----------------------\n')
        payload = base64_code.strip()
        if payload.startswith(self._SS_PREFIX):
            payload = payload[len(self._SS_PREFIX):]
        print('%s \n' % payload)

        # Restore '=' padding in case the publisher stripped it.
        payload += '=' * (-len(payload) % 4)
        decoded = base64.b64decode(payload)
        if not isinstance(decoded, str):
            # Python 3 hands back bytes; the rest of the class works on text.
            decoded = decoded.decode('utf-8')
        return decoded

    def write_json_config_file(self, config_code):
        """Write *config_code* verbatim to ``config.json`` in the working dir.

        NOTE: despite the file name, the content is the raw
        ``method:password@server:port`` string, not a JSON document.
        """
        if not isinstance(config_code, bytes):
            config_code = config_code.encode('utf-8')
        with open('config.json', 'wb') as fw:
            fw.write(config_code)

    def parse_base64_decoding(self, base64_decoding_code):
        """Split ``method:password@server:port`` into parts and print them.

        The password may itself contain ``@`` or ``:``: the method ends at
        the first ``:``, the server part starts after the last ``@``, and the
        port follows the last ``:``.

        Returns:
            A ``(method, password, server, port)`` tuple of strings.

        Raises:
            ValueError: the text does not have the expected layout.
        """
        print('--------------------base64_decoding_code_parse---------\n')

        credentials, at_sign, encoding_server = base64_decoding_code.rpartition('@')
        encoding_method, colon, encoding_password = credentials.partition(':')
        encoding_server_ip, port_colon, encoding_server_port = encoding_server.rpartition(':')
        if not (at_sign and colon and port_colon):
            raise ValueError('expected method:password@server:port, got %r'
                             % (base64_decoding_code,))

        print('method  %s' % encoding_method)
        print('passoword  %s' % encoding_password)
        print('server  %s' % encoding_server_ip)
        print('port  %s' % encoding_server_port)
        return (encoding_method, encoding_password,
                encoding_server_ip, encoding_server_port)

if __name__ == "__main__":
    # Script entry point: run the spider over and over until interrupted.
    # A pause between runs keeps the loop from flooding the docs site and
    # the zxing decoder with back-to-back requests.
    POLL_INTERVAL_SECONDS = 20

    print('img downloading....')
    mao = MDTZ()
    spider_times = 1

    try:
        while True:
            print('spider starting at  %s' % spider_times)
            mao.startSpider()
            print('spider done at  %s \n' % spider_times)
            time.sleep(POLL_INTERVAL_SECONDS)
            spider_times = spider_times + 1
    except KeyboardInterrupt:
        # Ctrl-C is the intended way to leave the endless loop.
        print('spider stopped by user')

欢迎光临黑马程序员技术交流社区 (http://bbs.itheima.com/)

黑马程序员IT技术论坛 X3.2