Python web scraping: multi-threaded collection, the WDPYSPIDER class, and pycurl

Python

import urllib

urlItem = urllib.urlopen("http://www.baidu.com")
htmSource = urlItem.read()
urlItem.close()
print htmSource

pycurl

http://pycurl.sourceforge.net/download/

http://pycurl.sourceforge.net/doc/curlobject.html

Python

import pycurl
import StringIO

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.HTTPHEADER, ["Accept:"])
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.perform()
print b.getvalue()
print c.getinfo(pycurl.INFO_FILETIME)

curl_easy_setopt

These options tell libcurl how to behave (a combined usage sketch follows the option list below).
CURLOPT_WRITEFUNCTION: write (download) callback; libcurl hands each chunk of body data to it, with the per-call chunk size bounded by CURL_MAX_WRITE_SIZE (defined in curl.h)
CURLOPT_WRITEDATA: write the response straight to a file; in pycurl pass an open file object, e.g. c.setopt(pycurl.WRITEDATA, open(r'E:\WebSite\py\1.txt', 'wb'))
CURLOPT_READFUNCTION: read (upload) callback
CURLOPT_SEEKFUNCTION: seek callback for moving the data pointer: int function(void *instream, curl_off_t offset, int origin); origin is SEEK_SET, SEEK_CUR or SEEK_END; returns CURL_SEEKFUNC_OK, CURL_SEEKFUNC_FAIL or CURL_SEEKFUNC_CANTSEEK (0, 1, 2)
CURLOPT_OPENSOCKETFUNCTION:
CURLOPT_HEADERFUNCTION: receives only the header data: size_t function(void *ptr, size_t size, size_t nmemb, void *userdata);
CURLOPT_DEBUGFUNCTION: int curl_debug_callback(CURL *, curl_infotype, char *, size_t, void *);
CURLOPT_VERBOSE: set to 1 to print verbose details about the transfer
CURLOPT_HEADER: set to 1 to include the response headers in the returned body
CURLOPT_NOSIGNAL: set to 1 to stop libcurl from using signals (recommended in multi-threaded programs)
CURLOPT_FOLLOWLOCATION: set to 1 to tell libcurl to follow any Location: redirect
CURLOPT_MAXREDIRS: limit on the number of redirects to follow; -1 means unlimited (the default)
CURLOPT_PUT: related to uploading data with HTTP PUT
CURLOPT_POST:
CURLOPT_POSTREDIR:
CURLOPT_POSTFIELDS:
CURLOPT_POSTFIELDSIZE:
CURLOPT_POSTFIELDSIZE_LARGE:
CURLOPT_COPYPOSTFIELDS:
CURLOPT_HTTPPOST:
CURLOPT_UPLOAD:
CURLOPT_AUTOREFERER: libcurl sets the Referer header automatically when following redirects
CURLOPT_REFERER: forge the Referer (source) URL
CURLOPT_USERAGENT: custom User-Agent
CURLOPT_HTTPHEADER: custom headers
CURLOPT_COOKIE: "name1=content1; name2=content2;"
CURLOPT_COOKIEFILE:
CURLOPT_COOKIEJAR:
CURLOPT_COOKIESESSION: by default libcurl always loads and stores all cookies
CURLOPT_COOKIELIST:
CURLOPT_HTTPGET:
CURLOPT_HTTP_VERSION: CURL_HTTP_VERSION_NONE, CURL_HTTP_VERSION_1_0, CURL_HTTP_VERSION_1_1
CURLOPT_IGNORE_CONTENT_LENGTH: ignore the Content-Length header; works around servers such as Apache 1.x
CURLOPT_HTTP_TRANSFER_DECODING: tell libcurl whether to decode the transfer encoding (0 or 1)
CURLOPT_HTTP200ALIASES: custom aliases for the HTTP 200 response line, for servers that return a non-standard 200
CURLOPT_ENCODING: set the accepted content encoding, as in Accept-Encoding ('', 'gzip', ...)
CURLOPT_UNRESTRICTED_AUTH: set to 1 to keep sending authentication (user + password) when following redirects
NETWORK OPTIONS
CURLOPT_URL: http://xxxx, ftp://xxxx
CURLOPT_PROXY: HTTP proxy, as a host name or IP address
CURLOPT_PROXYPORT: proxy port; it can also be appended to the PROXY address as ":port", e.g. :8080
CURLOPT_PROXYTYPE: proxy type: CURLPROXY_HTTP (default), CURLPROXY_HTTP_1_0, CURLPROXY_SOCKS4, CURLPROXY_SOCKS5, CURLPROXY_SOCKS4A, CURLPROXY_SOCKS5_HOSTNAME
CURLOPT_NOPROXY: domains that bypass the proxy
CURLOPT_HTTPPROXYTUNNEL:
CURLOPT_BUFFERSIZE: libcurl's receive buffer size, in bytes
(Authentication)
CURLOPT_NETRC: controls how credentials from ~/.netrc are used: CURL_NETRC_OPTIONAL uses the ~/.netrc file, CURL_NETRC_IGNORED (default) ignores it, CURL_NETRC_REQUIRED requires the file and ignores credentials given in the URL
CURLOPT_NETRC_FILE: specify the ~/.netrc file to use
CURLOPT_USERNAME:
CURLOPT_USERPWD:
CURLOPT_PASSWORD:
CURLOPT_PROXYUSERNAME:
CURLOPT_PROXYUSERPWD:
CURLOPT_HTTPAUTH:
CURLOPT_PROXYAUTH:
  • CURLAUTH_BASIC: HTTP Basic authentication
  • CURLAUTH_DIGEST: HTTP Digest authentication
  • CURLAUTH_DIGEST_IE:
  • CURLAUTH_GSSNEGOTIATE: Kerberos 5 / Negotiate authentication; requires a GSS-API build
  • CURLAUTH_NTLM: NTLM authentication
  • CURLAUTH_ANY: enable all methods and let libcurl pick the one it considers suitable and secure
  • CURLAUTH_ANYSAFE: enable all methods except plain Basic
  • CURLAUTH_ONLY: force the selected method to be the only one used for all requests
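
To make the options above concrete, here is a minimal sketch that combines several of them in one pycurl request: a form POST with a custom User-Agent and Referer, redirect following, and a cookie file. The URL and form fields are placeholders rather than anything from the original post; uncomment the PROXY line to send the request through a proxy.

Python

import pycurl
import urllib
import StringIO

c = pycurl.Curl()
body = StringIO.StringIO()
c.setopt(pycurl.URL, "http://localhost/login.php")          # placeholder URL
c.setopt(pycurl.USERAGENT, "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)")
c.setopt(pycurl.REFERER, "http://localhost/")
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.setopt(pycurl.COOKIEFILE, "cookies.txt")                   # read cookies from this file
c.setopt(pycurl.COOKIEJAR, "cookies.txt")                    # and write them back when the handle is closed
c.setopt(pycurl.POST, 1)
c.setopt(pycurl.POSTFIELDS, urllib.urlencode({'username': 'hzq', 'password': 'blog'}))
#c.setopt(pycurl.PROXY, "127.0.0.1:8080")                    # optional proxy, host:port
c.setopt(pycurl.WRITEFUNCTION, body.write)
c.perform()
print body.getvalue()
c.close()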

getinfo

CURLINFO_RESPONSE_CODE: the last HTTP or FTP response code received, e.g. 200, 404, 403, 505; for a proxy's CONNECT response see CURLINFO_HTTP_CONNECTCODE
CURLINFO_EFFECTIVE_URL: the last effective URL used
CURLINFO_HTTP_CONNECTCODE: the last proxy CONNECT response code received (as a long)
CURLINFO_FILETIME:
CURLINFO_TOTAL_TIME:
CURLINFO_CONNECT_TIME:
CURLINFO_NUM_CONNECTS: how many connections were used
CURLINFO_CONTENT_TYPE: e.g. text/html
CURLINFO_REQUEST_SIZE:
CURLINFO_HEADER_SIZE:
CURLINFO_SIZE_DOWNLOAD: total bytes downloaded
CURLINFO_SIZE_UPLOAD:
CURLINFO_HTTPAUTH_AVAIL: a bitmask of the authentication methods the server offers
CURLINFO_PROXYAUTH_AVAIL: a bitmask of the authentication methods the proxy offers
CURLINFO_COOKIELIST:
In pycurl some of these constants keep the INFO_ prefix, e.g. INFO_COOKIELIST; a short getinfo sketch follows below.
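
A short sketch of reading transfer information with getinfo() after perform(), reusing the test URL from the example above:

Python

import pycurl
import StringIO

c = pycurl.Curl()
b = StringIO.StringIO()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.perform()
print c.getinfo(pycurl.RESPONSE_CODE)    # e.g. 200
print c.getinfo(pycurl.EFFECTIVE_URL)    # final URL after redirects
print c.getinfo(pycurl.CONTENT_TYPE)     # e.g. text/html
print c.getinfo(pycurl.TOTAL_TIME)       # total transfer time in seconds
print c.getinfo(pycurl.SIZE_DOWNLOAD)    # bytes downloaded
c.close()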

A rough scraping example that shares one Curl object

Python

import pycurl
import StringIO
import string
import random

class spider:
    def __init__(self, addHeader=[]):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            #,'User-Agent: Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] + addHeader
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        self.curl.setopt(pycurl.REFERER, 'http://www.google.com/search?source=' + self.rand_str())
        #self.curl.setopt(pycurl.AUTOREFERER, 1)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.MAXREDIRS, 5)

    def __del__(self):
        pass

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename):
        fp = open(filename, 'w')
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEFUNCTION, fp.write)
        self.curl.perform()
        fp.close()
        return True

    def html(self, url):
        sio = StringIO.StringIO()
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEFUNCTION, sio.write)
        self.curl.perform()
        reval = sio.getvalue()
        sio.close()
        return reval

if __name__ == "__main__":
    get = spider(['User-Agent: Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'])
    print get.html("http://localhost/spider_for_test.php")
    print get.tofile("http://localhost/spider_for_test.php", r'E:\WebSite\wwwroot\test.txt')
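
Because every request goes through the single pycurl.Curl handle created in __init__, this class is not safe to call from several threads at once; the multi-threaded example below avoids the problem by creating a fresh handle inside fetch() for every request.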

A multi-threaded scraping example

Python

import pycurl
import threading
import StringIO
import string
import random

class spider:
    def __init__(self, referer='', httpheader=[]):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            ,'User-Agent: Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] + httpheader
        self.referer = referer

    def __del__(self):
        pass

    def fetch(self, url, stream):
        # a fresh Curl handle per call, so fetch() can run safely from several threads
        curl = pycurl.Curl()
        curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        if self.referer == '':
            curl.setopt(pycurl.AUTOREFERER, 1)
        else:
            curl.setopt(pycurl.REFERER, self.referer)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        curl.setopt(pycurl.URL, url)
        curl.setopt(pycurl.WRITEFUNCTION, stream.write)
        curl.perform()
        curl.close()

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename):
        fp = open(filename, 'w')
        self.fetch(url, fp)
        fp.close()
        return True

    def html(self, url):
        sio = StringIO.StringIO()
        self.fetch(url, sio)
        reval = sio.getvalue()
        sio.close()
        return reval

def gethtml(url, get):
    print get.html(url)

if __name__ == "__main__":
    import datetime
    dstart = datetime.datetime.now()
    get = spider()
    get.referer = 'http://www.google.com/search?source=' + get.rand_str()
    thread_pool = []
    for i in range(10):
        url = "http://localhost/test.php?n=" + str(i)
        th = threading.Thread(target=gethtml, args=(url, get))
        thread_pool.append(th)
    for i in range(10):
        thread_pool[i].start()
    for i in range(10):
        thread_pool[i].join()
    dend = datetime.datetime.now()
    print "Time span:", dend - dstart

The WDPYSPIDER class (supports multi-threading, proxies, login authentication, POST)

Python

# coding:utf-8
import pycurl
import urllib
import threading
import StringIO
import string
import random

class spider:
    '''WDPYSPIDER (Whiledo Python Spider Class)

    @author HzqGhost admin@whiledo.com QQ:313143468
    get = spider()
    get.referer = 'http://www.google.com/search?source=' + get.rand_str()
    get.proxyuse = True
    get.proxyip = ['059148233056.ctinets.com:80']
    url = "http://www.whiledo.com"
    print get.html(url=url)'''
    def __init__(self):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            ,'User-Agent: Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ]                                                # HTTP request headers
        self.referer = ''                                # forged Referer (source URL)
        self.connnecttimeout = 60                        # connect timeout (seconds)
        self.timeout = 300                               # read timeout (seconds)
        self.backheader = 0                              # include the server's HTTP headers in the output (mainly for testing)
        self.cookesfile = "./cookes.dat"                 # cookie file, read and written automatically
        self.proxyuse = False                            # whether to use a proxy server
        self.proxyip = []                                # proxy servers as [IP:PORT]; one is picked at random per request
        self.proxynodomain = ['localhost', '127.0.0.1']  # domains that bypass the proxy
        self.http200alias = []                           # aliases for non-standard 200 response lines
        self.error = 'WDPYERROR'                         # error marker returned for non-200 responses

    def __del__(self):
        pass

    def fetch(self, url, stream, post={}):
        '''
        --url
        --stream [stream] StringIO or fp
        --post [dict] {'username':'hzq','password':'blog'}'''
        curl = pycurl.Curl()
        curl.setopt(pycurl.CONNECTTIMEOUT, self.connnecttimeout)
        curl.setopt(pycurl.TIMEOUT, self.timeout)
        curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        curl.setopt(pycurl.HTTP200ALIASES, self.http200alias)
        curl.setopt(pycurl.HEADER, self.backheader)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        if self.referer == '':
            curl.setopt(pycurl.AUTOREFERER, 1)
        else:
            curl.setopt(pycurl.REFERER, self.referer)
        curl.setopt(pycurl.COOKIEJAR, self.cookesfile)
        curl.setopt(pycurl.COOKIEFILE, self.cookesfile)
        curl.setopt(pycurl.WRITEFUNCTION, stream.write)
        curl.setopt(pycurl.URL, url)
        if self.proxyuse:
            proxyip = self.proxyip[random.randint(0, len(self.proxyip) - 1)]
            curl.setopt(pycurl.PROXY, proxyip)
            #curl.setopt(pycurl.PROXYNO, self.proxynodomain)  # needs pycurl built against libcurl 7.19.4+
        if len(post) > 0:
            curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(post))
        status = ''
        try:
            curl.perform()
            status = curl.getinfo(pycurl.RESPONSE_CODE)
        except:
            status = curl.errstr()
        finally:
            curl.close()
        status = str(status)
        if status != '200':
            status = self.error
        return status

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename, post={}):
        fp = open(filename, 'wb')
        self.fetch(url, fp, post)
        fp.close()
        return True

    def html(self, url, post={}):
        sio = StringIO.StringIO()
        reval = self.fetch(url, sio, post)
        if reval == '200':
            reval = sio.getvalue()
        sio.close()
        return reval

def gethtml(url, get):
    print get.html(url)

if __name__ == "__main__":
    get = spider()
    get.referer = 'http://www.google.com/search?source=' + get.rand_str()
    get.proxyuse = True
    get.proxyip = ['059148233056.ctinets.com:80']
    url = "http://www.whiledo.com"
    print get.html(url=url)
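
The post parameter is what carries login form data: when it is non-empty the request is sent as a POST, and the session cookies are written to ./cookes.dat and replayed on later requests. A minimal usage sketch follows; the URLs are placeholders and the field names come from the docstring above.

Python

get = spider()
# log in first; the cookie file keeps the session for later requests
print get.html("http://localhost/login.php", post={'username': 'hzq', 'password': 'blog'})
# subsequent requests reuse the stored cookies
print get.html("http://localhost/member.php")
get.tofile("http://localhost/member.php", "member.html")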

http://www.cnblogs.com/huangcong/

This article is copyrighted jointly by the author and 博客园 (cnblogs). Reposting is welcome, but unless the author agrees otherwise this notice must be retained and a clearly visible link to the original article must be given on the page; otherwise the author reserves the right to pursue legal liability.
