0，python 中关于下载的部分总结如下：

import urllib

if __name__ == "__main__":
    # Demo: download one page and print its HTML re-encoded as UTF-8.
    url = "http://www.mntuku.cn"

    # Read the raw HTML source for the URL. The handle is closed in a
    # ``finally`` so the connection is released even if the read fails.
    response = urllib.urlopen(url)
    try:
        content = response.read()
    finally:
        response.close()

    # Transcode from the page's encoding to UTF-8 so Chinese text is readable.
    # The source charset is hard-coded: check the charset declared in the
    # page's own HTML and adjust it here if it is not gb2312.
    # NOTE(review): the original note cites Baidu as the gb2312 example, but
    # the URL above is a different site -- confirm its charset.
    content = content.decode("gb2312").encode("utf-8")

    # The parenthesised single-argument form prints exactly what the
    # Python 2 ``print content`` statement printed.
    print(content)

1，处理 A 标签字符串：

#!/usr/bin/python

#encoding=utf-8

import htmllib,urllib,formatter,string

'''

import chardet,sys

type = sys.getdefaultencoding()

'''

class GetLinks(htmllib.HTMLParser): #从 HTMLParser 类中继承

def __init__(self):
    """Set up an empty link table and initialise the base HTML parser.

    ``self.links`` is a dict keyed by anchor text whose values are the
    corresponding href strings (``anchor_end`` stores
    ``links[text] = link``).
    """
    # Anchor text -> href, filled in as anchors are closed.
    self.links = {}
    # href of the anchor currently being parsed. Defined up front so that
    # ``anchor_end`` can always read the attribute.
    self.link = None
    # NullFormatter performs no output; this parser only collects links
    # and does not render the document.
    null_formatter = formatter.NullFormatter()
    htmllib.HTMLParser.__init__(self, null_formatter)

def anchor_bgn(self, href, name, type):
    """Handle the start of an anchor (``<a>``) tag.

    Begins buffering the anchor's character data and remembers *href* on
    ``self.link``. *name* and *type* are accepted but not used here.
    """
    # Start capturing the text between <a> and </a>.
    self.save_bgn()
    # Remember the target of the anchor that is now open.
    self.link = href

def anchor_end(self): #锚点标签结束的时候处理

text = string.strip(self.save_end()) #去掉 A 标签保留 A 标签的信息

if self.link and text:

self.links[text] = self.link#self.links.get(text, []) + [self.lin