最近闲来无事,用 Python 写了一个小爬虫,记录一下源码。写得非常简单,什么附加参数都没用。
import re
import time
import urllib.error
import urllib.request


def getRawWebContent(weburl):
    """Fetch *weburl* and return its body decoded as UTF-8.

    Returns "" on any fetch/decode failure (network error, malformed URL,
    non-UTF-8 body) — callers treat an empty string as "nothing to parse".
    """
    try:
        # Context manager closes the connection even if read()/decode() fails
        # (the original leaked the response object).
        with urllib.request.urlopen(weburl) as response:
            return response.read().decode("utf8")
    except (urllib.error.URLError, ValueError, UnicodeDecodeError):
        # Narrow exceptions instead of a bare `except:` that would also
        # swallow KeyboardInterrupt/SystemExit.
        return ""


def parseContent(weburl, regex):
    """Return the set of all matches of compiled pattern *regex* in the text.

    NOTE: despite its name, *weburl* is page CONTENT (raw HTML), not a URL —
    the parameter name is kept for backward compatibility.
    """
    return set(regex.findall(weburl))


if __name__ == "__main__":
    total_urls = {"https://www.taobao.com"}
    parsed_urls = set()
    # Raw string with escaped dots: the original's unescaped "." matched
    # any character, so e.g. "https://wwwxtaobaoycom/..." would also match.
    regex = re.compile(r"https://www\.taobao\.com/\S+/")
    index = 0
    while index < 10:  # just a small smoke test: crawl at most 10 pages
        # Snapshot of the not-yet-crawled URLs: the original mutated
        # `totalset` while iterating it, which is only legal because of the
        # immediate `break`; a snapshot makes it safe unconditionally.
        pending = total_urls - parsed_urls
        if not pending:
            # Nothing new discovered -> stop. The original looped forever
            # here because `index` could never reach 10.
            break
        for url in pending:
            rawcontent = getRawWebContent(url)
            total_urls |= parseContent(rawcontent, regex)
            parsed_urls.add(url)
            index += 1
            print("index:%d %s" % (index, url))
            time.sleep(2)  # throttle so the server does not block us
            break  # re-snapshot after each fetch, one page per pass
    print("finish")
发表评论