source: proiecte/HadoopJUnit/hadoop-0.20.1/contrib/hod/hodlib/Common/miniHTMLParser.py @ 120

Last change on this file since 120 was 120, checked in by (none), 14 years ago

Added the mail files for the Hadoop JUNit Project

  • Property svn:executable set to *
File size: 1.4 KB
Line 
1#Licensed to the Apache Software Foundation (ASF) under one
2#or more contributor license agreements.  See the NOTICE file
3#distributed with this work for additional information
4#regarding copyright ownership.  The ASF licenses this file
5#to you under the Apache License, Version 2.0 (the
6#"License"); you may not use this file except in compliance
7#with the License.  You may obtain a copy of the License at
8
9#     http://www.apache.org/licenses/LICENSE-2.0
10
11#Unless required by applicable law or agreed to in writing, software
12#distributed under the License is distributed on an "AS IS" BASIS,
13#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14#See the License for the specific language governing permissions and
15#limitations under the License.
16import urllib, urlparse, re
17
18from HTMLParser import HTMLParser
19
class miniHTMLParser( HTMLParser ):
  """Collect the hyperlinks found in the HTML fed to the parser.

  Every <a href="..."> encountered is resolved against the base URL set
  with setBaseUrl() and, if it has not been seen before by this parser,
  appended to a FIFO queue that is drained with getNextLink().

  Attributes:
    baseUrl:     URL that relative links are resolved against (None until
                 setBaseUrl() is called; links are then kept as written).
    viewedQueue: every link this parser has ever queued, used to avoid
                 queueing the same link twice.
    instQueue:   links queued but not yet returned by getNextLink().
  """

  # Class-level fallbacks, kept so the attributes still exist on the class
  # and on instances created without running __init__.  __init__ shadows
  # the two lists with per-instance ones so parsers do not share state.
  baseUrl = None
  viewedQueue = []
  instQueue = []

  def __init__(self, *args, **kwargs):
    """Initialise the underlying HTMLParser and this parser's own queues."""
    # Explicit base-class call rather than super(): HTMLParser is an
    # old-style class on Python 2, where super() cannot be used.
    HTMLParser.__init__(self, *args, **kwargs)
    self.viewedQueue = []
    self.instQueue = []

  def setBaseUrl(self, url):
    """Set the URL against which relative links are resolved."""
    self.baseUrl = url

  def getNextLink( self ):
    """Remove and return the oldest queued link, or None if none is left."""
    if not self.instQueue:
      return None
    return self.instQueue.pop(0)

  def handle_starttag( self, tag, attrs ):
    """Queue the target of an <a> tag (HTMLParser callback).

    tag   -- tag name as reported by HTMLParser.
    attrs -- list of (name, value) pairs for the tag's attributes.

    Anchors without an href value and mailto: links are ignored, as are
    links this parser has already queued once.
    """
    if tag != 'a':
      return

    # Look the href up by name: it is not necessarily the first attribute,
    # and an anchor may have no attributes at all.
    href = None
    for name, value in attrs:
      if name == 'href':
        href = value
        break
    if href is None:
      return

    newstr = urlparse.urljoin(self.baseUrl, str(href))

    # Only skip real mailto: links, not any URL that happens to contain
    # the text "mailto" somewhere in its path or query.
    if newstr.lstrip().lower().startswith('mailto:'):
      return

    if newstr not in self.viewedQueue:
      self.instQueue.append( newstr )
      self.viewedQueue.append( newstr )
43
44
45
Note: See TracBrowser for help on using the repository browser.