摘要:本文主要向大家介绍了Python语言网络编程学习笔记(三),通过具体的内容向大家展示,希望对大家学习Python语言有所帮助。
本文主要向大家介绍了Python语言网络编程学习笔记(三),通过具体的内容向大家展示,希望对大家学习Python语言有所帮助。
第二部分web Service
二、解析html和xhtml
第七章 解析Html 和XHtml p151-p168
1.提取标题
代码:
#coding=utf-8
from HTMLParser import HTMLParser
import sys
class TitleParser(HTMLParser):
    """Collect the text found inside a document's title element.

    Feed HTML text with ``feed()`` and read the result with ``gettitle()``.
    """

    def __init__(self):
        # Text accumulated so far from inside the title element.
        self.title = ''
        # Non-zero while the parser is between the title start and end tags.
        self.readingtitle = 0
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start collecting data when a title tag opens."""
        if tag == 'title':
            self.readingtitle = 1

    def handle_data(self, data):
        """Append *data* to the title while inside the title element."""
        if self.readingtitle:
            self.title += data

    def handle_endtag(self, tag):
        """Stop collecting data when the title end tag is seen."""
        if tag == 'title':
            self.readingtitle = 0

    def gettitle(self):
        """Return the title text collected so far."""
        return self.title
fd = open(sys.argv[1])
tp = TitleParser()
tp.feed(fd.read())
print "Title is:",tp.gettitle()
运行结果:
D:\python\python.exe E:/code/python/unit7/basic_title.py
E:/code/python/unit7/faqs.html
Title is: Appendix?B. MySQL 5.6 Frequently Asked Questions
Process finished with exit code 0
注:从表中摘取数据,
或
2.改进 代码: #coding=utf-8 from HTMLParser import HTMLParser from htmlentitydefs import entitydefs import sys class TitleParser(HTMLParser): def __init__(self): self.title='' self.readingtitle=0 HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): if tag =='title': self.readingtitle = 1 def handle_data(self, data): if self.readingtitle: self.title += data def handle_endtag(self, tag): if tag == 'title': self.readingtitle = 0 def handle_entityref(self, name): if entitydefs.has_key(name): self.handle_data(entitydefs[name]) else: self.handle_data('&'+name+';') def gettitle(self): return self.title fd = open(sys.argv[1]) tp = TitleParser() tp.feed(fd.read()) print "Title is:",tp.gettitle() etitle.html this is my text. 运行结果一: D:\python\python.exe E:/code/python/unit7/basic_title.py E:/code/python/unit7/etitle.html Title is: Document Title Intro Process finished with exit code 0 运行结果二: D:\python\python.exe E:/code/python/unit7/etitle.py E:/code/python/unit7/etitle.html Title is: Document Title &Intro Process finished with exit code 0 当一个实体出现时,代码检查该实体是否可以识别,可以,转换为相应得知,否则输入流中的文字; 3.转换字符参考 代码: #coding=utf-8 from HTMLParser import HTMLParser from htmlentitydefs import entitydefs import sys class TitleParser(HTMLParser): def __init__(self): self.title='' self.readingtitle=0 HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): if tag =='title': self.readingtitle = 1 def handle_data(self, data): if self.readingtitle: self.title += data def handle_endtag(self, tag): if tag == 'title': self.readingtitle = 0 def handle_entityref(self, name): if entitydefs.has_key(name): self.handle_data(entitydefs[name]) else: self.handle_data('&'+name+';') def handle_charref(self, name): try: charnum=int(name) except ValueError: return if charnum<1 or="" charnum="">225: return self.handle_data(chr(charnum)) def gettitle(self): return self.title fd = open(sys.argv[1]) tp = TitleParser() tp.feed(fd.read()) print "Title is:",tp.gettitle() 4.处理不均衡的标签 代码: #coding=utf-8 from 
HTMLParser import HTMLParser from htmlentitydefs import entitydefs import sys,re class TitleParser(HTMLParser): def __init__(self): self.taglevels=[] self.handledtags=['title','ul','li'] self.processing=None HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): if len(self.taglevels) and self.taglevels[-1] == tag: self.handle_endtag(tag) self.taglevels.append(tag) if tag in self.handledtags: self.data = '' self.processing = tag if tag == 'ul': print"List start" def handle_data(self, data): if self.processing: self.data += data def handle_endtag(self, tag): if not tag in self.taglevels: return while len(self.taglevels): starttag = self.taglevels.pop() if starttag in self.handledtags: self.finishprocessing(starttag) if starttag == tag: break def cleanse(self): self.data = re.sub('\s+', ' ', self.data) def finishprocessing(self, tag): self.cleanse() if tag == 'title' and tag == self.processing: print "Dom title", self.data elif tag == 'ul': print "List ended" elif tag == 'li' and tag == self.processing: print "List item", self.data self.processing = None def gettitle(self): return self.title 处理特殊值,如果在映射表中有对应的,即采用映射的值,否则为字面值 def handle_entityref(self, name): if entitydefs.has_key(name): self.handle_data(entitydefs[name]) else: self.handle_data('&' + name + ';') def handle_charref(self, name): try: charnum = int(name) except ValueError: return if charnum < 1 or charnum > 255: return self.handle_data(chr(charnum)) fd = open(sys.argv[1]) tp = TitleParser() tp.feed(fd.read()) 运行结果: D:\python\python.exe E:/code/python/unit7/4un.py E:/code/python/unit7/4un.html Dom title DOCTYPE Title & Intro? 
List start List item First List item List item second list item List item second list item List ended Process finished with exit code 0 5.一个可以实际工作的例子 三、XML和XML-RPC P169-p190 展示XML文档:tree,event.基于事件的解析器可以扫描文档,事件解析器可以响应。 8.2 使用Dom 代码: #coding=utf-8 from xml.dom import minidom,Node def scanNode(node,level=0): msg = node.__class__.__name__ if node.nodeType == Node.ELEMENT_NODE: msg += ",tag" + node.tagName print " " * level * 4, msg if node.hasChildNodes: for child in node.childNodes: scanNode(child, level + 1) doc = minidom.parse("Sample.xml") scanNode(doc) 运行结果: D:\python\python.exe E:/code/python/unit8/un1.py Document > Element,tagbook > Text > Element,tagtitle > Text > Text > Element,tagauthor > Text > Element,tagname > Text > Element,tagfirst > Text > Text > Element,taglast > Text > Text > Text > Element,tagaffiliation > Text > Text > Text > Element,tagchapter > Text > Element,tagtitle > Text > Text > Element,tagpara > Text > Element,tagcompany > Text > Text > Text > Text Process finished with exit code 0 sample.xml I think widgets are great.you should buy lots of them from 2.使用dom完全解析 代码: #coding=utf-8 """ 将XML以文本形式重新格式化输出 1.使用Node的节点类型,判断下一步如何处理 2.对不同的节点名(tagName)进行相应的处理 """ from xml.dom import minidom, Node import re, textwrap class SampleScanner: def __init__(self, doc): for child in doc.childNodes: if child.nodeType == Node.ELEMENT_NODE and child.tagName == "book": """只处理book元素""" self.handleBook(child) def gettext(self, nodelist): """获取当前节点的文本, 1.如果当前的节点为TEXT_NODE,将文本追加到列表中 2.如果当前的节点不是TEXT_NODE,递归地调用gettext""" retlist = [] for node in nodelist: if node.nodeType == Node.TEXT_NODE: retlist.append(node.wholeText) elif node.hasChildNodes: retlist.append(self.gettext(node.childNodes)) return re.sub("\s+", " ", "".join(retlist)) def handleBook(self, node): """处理Book节点 1.如果不是ELEMENT_NODE,不予理睬 2.如果是title,直接打印出文本内容 3.如果是author,调用handleAuthor,继续处理节点 4.如果是chapter,调用handleChapter,继续处理节点 """ for child in node.childNodes: if child.nodeType != Node.ELEMENT_NODE: continue 
if child.tagName == "title": print "Book title is :", self.gettext(child.childNodes) if child.tagName == "author": self.handleAuthor(child) if child.tagName == "chapter": self.handleChapter(child) def handleAuthor(self, node): """处理Autho节点 1.如果不是ELEMENT_NODE,不予理睬 2.如果是name,调用handleAuthoerName,继续处理节点 3.如果是affiliation,调用gettext,并打印出来 """ for child in node.childNodes: if child.nodeType != Node.ELEMENT_NODE: continue if child.tagName == "name": self.handleAuthorName(child) elif child.tagName == "affiliation": print "Author affiliation:", self.gettext([child]) def handleAuthorName(self, node): """处理author.name节点 1.使用getElementsByTagName获得子节点 2.调用gettext得到子节点的文本,并打印处理 """ surname = self.gettext(node.getElementsByTagName("last")) givenname = self.gettext(node.getElementsByTagName("first")) print "Author Name:%s %s " % (surname, givenname) def handleChapter(self, node): """处理chapter节点 1.如果不是ELEMENT_NODE,不予理睬 2.如果是para,调用handlePara,继续处理 """ print "*** Start of Chapter %s,%s" % ( node.getAttribute("number"), self.gettext(node.getElementsByTagName("title"))) for child in node.childNodes: if child.nodeType != Node.ELEMENT_NODE: continue if child.tagName == "para": self.handlePara(child) def handlePara(self, node): """ 1.获取当前节点的文本 2.调用textwrap格式化文本 """ paratext = self.gettext([node]) paratext = textwrap.fill(paratext) print paratext doc = minidom.parse("Sample.xml") SampleScanner(doc) 运行结果: D:\python\python.exe E:/code/python/unit8/un2.py Book title is : Sample XML Thing Author Name:Smith Benjamin Author affiliation: Springy Widgets,Inc. 
*** Start of Chapter 1,First chapter I think widgets are great.you should buy lots of them from Springy widgets,Inc Process finished with exit code 0 3.使用Dom产生文档 代码: #coding=utf-8 """ 使用minidom生成XML 1.创建Element,createElement 2.添加子节点,appendChild 3.创建Text,createTextNode 4.创建属性,createAttribute """ from xml.dom import minidom,Node # 创建Document doc = minidom.Document() # 创建book节点 book = doc.createElement("book") doc.appendChild(book) # 创建Title节点 title = doc.createElement("title") text = doc.createTextNode("Sample XML Thing") title.appendChild(text) book.appendChild(title) # 创建author节点 author = doc.createElement("author") # 创建name节点 name = doc.createElement("name") first = doc.createElement("first") first.appendChild(doc.createTextNode("Benjamin")) name.appendChild(first) last = doc.createElement("last") last.appendChild(doc.createTextNode("Smith")) name.appendChild(last) author.appendChild(name) book.appendChild(author) # author节点完毕 # 创建chapter节点 chapter = doc.createElement("chapter") chapter.setAttribute("number","1") title = doc.createElement("title") title.appendChild(doc.createTextNode("Fisrt Chapter")) chapter.appendChild(title) para = doc.createElement("para") para.appendChild(doc.createTextNode("I think widgets are great.you should buy lots \ of them from")) company = doc.createElement("company") company.appendChild(doc.createTextNode("Springy widgets,Inc")) para.appendChild(company) chapter.appendChild(para) # chapter节点完毕 book.appendChild(chapter) # book节点完毕 print doc.toprettyxml(indent = " ") 运行结果: D:\python\python.exe E:/code/python/unit8/un3.py I think widgets are great.you should buy lots of them from Process finished with exit code 0 4.dom类型参考 8.3使用xml-rpc 5. 
代码: #coding=utf-8 import xmlrpclib url='//liandesinian.blog.51cto.com/7737219/1565474' s=xmlrpclib.ServerProxy(url) catdata=s.meerkat.getCategories() cattiles=[item['title'] for item in catdata] cattiles.sort() for item in cattiles: print item 运行结果: D:\python\python.exe E:/code/python/unit8/un6.py Process finished with exit code 0 6. 代码: #coding=utf-8 import xmlrpclib,sys,textwrap class NewsCat: def __init__(self,catdata): self.id=catdata['id'] self.title=catdata['title'] def __cmp__(self, other): return cmp(self.title,other.title) class NewsSource: def __init__(self,url='//www.oreillynet.com/meerkat/xml-rpc/server.php'): self.s=xmlrpclib.ServerProxy(url) self.loadcats() def loadcats(self): print "Loading categories...." catdata=self.s.meerkat.getCatgries() self.cats=[NewsCat(item) for item in catdata] self.cat.sort() def displaycats(self): numonline=0 i=0 for item in self.cats: sys.stdout.write("%2d:%20.20s"%(i+1,item.title)) i+=1 numonline+=1 if numonline%3==0: sys.stdout.write("\n") if numonline!=0: sys.stdout.write("\n") def promotcat(self): sys.__displaycats() sys.stdout.write("select a catgory or q to quit") selection = sys.stdin.readline().strip() if selection == 'q': sys.exit(0) return int(selection) - 1 def dispact(self, cat): items = self.s.meerkat.getItems({'category': cat, 'ids': 1, 'descriptions': 1, 'categories': 1, 'channels': 1, 'data': 1, 'num_items': 15}) if not len(items): print "Sorry,no items in that category." 
sys.stdout.write("Press Enter to continue:") sys.stdin.readline() return while 1: print self.dispitemsummary(items) sys.stdout.write("select a catgory or q to quit") selection = sys.stdin.readline().strip() if selection=='q': return self.dispitem(items[int(selection)-1]) def dispitemsummary(self, items): counter = 0 for item in items: print "%2d:%s"(counter + 1, item['title']) counter += 1 def dispitem(self, item): print "---%s---" % item['title'] print "Posted on", item['data'] print "Description:" print textwrap.fill(item['description']) print "\nlink:", item['link'] sys.stdout.write("\nPress Enter to continue: ") sys.stdin.readline() n = NewsSource() while 1: cat = n.promotcat() n.dispact(cat) |
本文由职坐标整理并发布,希望对同学们学习Python有所帮助,更多内容请关注职坐标编程语言Python频道!
您输入的评论内容中包含违禁敏感词
我知道了
请输入正确的手机号码
请输入正确的验证码
您今天的短信下发次数太多了,明天再试试吧!
我们会在第一时间安排职业规划师联系您!
您也可以联系我们的职业规划师咨询:
版权所有 职坐标-一站式IT培训就业服务领导者 沪ICP备13042190号-4
上海海同信息科技有限公司 Copyright ©2015 www.zhizuobiao.com,All Rights Reserved.
沪公网安备 31011502005948号