        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently, so need to sleep
                time.sleep(sleep_secs)
                print 'sleep %d sec' % sleep_secs
        self.domains[domain] = datetime.datetime.now()
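This snippet is the body of a wait method; the enclosing class is not shown in this excerpt. A minimal sketch of the assumed Throttle class around it, deriving domain and last_accessed from the requested URL:

import datetime
import time
import urlparse

class Throttle:
    """Sleep between downloads to the same domain (assumed class shape)"""
    def __init__(self, delay):
        # minimum number of seconds to wait between downloads per domain
        self.delay = delay
        # timestamp of when each domain was last accessed
        self.domains = {}

    def wait(self, url):
        # look up when this URL's domain was last accessed
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()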
import re

def get_links(html):
    """Return a list of links found in html"""
    print html
    # regular expression to extract the href value from each anchor tag
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
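As a quick sanity check, the regular expression captures the href value of each anchor tag; with a made-up snippet of HTML:

html = '<a href="/places/default/view/1">Example</a>'
links = get_links(html)   # also echoes the html via the print above
print links               # ['/places/default/view/1']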
import urlparse

def link_crawler(seed_url, link_regex, delay=1):
    """Crawl from the given seed URL, following links matched by link_regex"""
    throttle = Throttle(delay)
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        # throttle downloads to avoid hitting the same domain too quickly
        throttle.wait(url)
        html = download(url)
        for link in get_links(html):
            # check whether the link matches the expected regex
            if re.search(link_regex, link):
                # form an absolute link
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
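Assuming a download(url) helper that fetches a page and returns its HTML, the crawler can then be pointed at a seed URL with a regex restricting which links are followed; the site and pattern below are illustrative:

# follow only the index and view pages of the example site (illustrative)
link_crawler('http://example.webscraping.com', '/(index|view)', delay=1)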
<!-- Mobile Viewport Fix
  j.mp/mobileviewport & davidbcalhoun.com/2010/viewport-metatag
  device-width: Occupy full width of the screen in its current orientation
  initial-scale = 1.0 retains dimensions instead of zooming out if page height > device height
  user-scalable = yes allows the user to zoom in -->
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<!-- All JavaScript at the bottom, except for Modernizr which enables HTML5 elements & feature detects -->
<script src="/places/static/js/modernizr.custom.js"></script>
<!-- include stylesheets -->
<script type="text/javascript"><!--
// These variables are used by the web2py_ajax_init function in web2py_ajax.js (which is loaded below).
var w2p_ajax_confirm_message = "Are you sure you want to delete this object?";
var w2p_ajax_disable_with_message = "Working...";
var w2p_ajax_date_format = "%Y-%m-%d";
var w2p_ajax_datetime_format = "%Y-%m-%d %H:%M:%S";
var ajax_error_500 = 'An error occured, please <a href="/places/default/index">reload</a> the page'
//--></script>
<metaname="tags"content="web2py, python, web scraping" /> <metaname="generator"content="Web2py Web Framework" /> <metaname="author"content="Richard Penman" /> <scriptsrc="/places/static/js/jquery.js"type="text/javascript"></script><linkhref="/places/static/css/calendar.css"rel="stylesheet"type="text/css" /><scriptsrc="/places/static/js/calendar.js"type="text/javascript"></script><scriptsrc="/places/static/js/web2py.js"type="text/javascript"></script><linkhref="/places/static/css/web2py.css"rel="stylesheet"type="text/css" /><linkhref="/places/static/css/bootstrap.min.css"rel="stylesheet"type="text/css" /><linkhref="/places/static/css/bootstrap-responsive.min.css"rel="stylesheet"type="text/css" /><linkhref="/places/static/css/style.css"rel="stylesheet"type="text/css" /><linkhref="/places/static/css/web2py_bootstrap.css"rel="stylesheet"type="text/css" />
<!-- uncomment here to load jquery-ui
<link rel="stylesheet" href="http://ajax.googleapis.com/ajax/libs/jqueryui/1.10.3/themes/ui-lightness/jquery-ui.css" type="text/css" media="all" />
<script src="http://ajax.googleapis.com/ajax/libs/jqueryui/1.10.3/jquery-ui.min.js" type="text/javascript"></script>
uncomment to load jquery-ui //-->
<noscript><link href="/places/static/css/web2py_bootstrap_nojs.css" rel="stylesheet" type="text/css" /></noscript>
<!-- the next tag is necessary for bootstrap menus, do not remove -->
<button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse" style="display:none;">
  <span class="icon-bar"></span>
  <span class="icon-bar"></span>
  <span class="icon-bar"></span>
</button>
<!-- The javascript =============================================
  (Placed at the end of the document so the pages load faster) -->
<script src="/places/static/js/bootstrap.min.js"></script>
<script src="/places/static/js/web2py_bootstrap.js"></script>
<!--[if lt IE 7 ]>
  <script src="/places/static/js/dd_belatedpng.js"></script>
  <script> DD_belatedPNG.fix('img, .png_bg'); //fix any <img> or .png_bg background-images </script>
<![endif]-->
</body>
</html>
def link_crawler(seed_url, link_regex, delay=1, scrape_callback=None, max_depth=10):
    throttle = Throttle(delay)
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        throttle.wait(url)
        html = download(url)
        links = []
        if scrape_callback:
            # let the callback scrape the page and optionally return more links
            links.extend(scrape_callback(url, html) or [])
        for link in links:
            if re.search(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
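Here, scrape_callback is any callable taking (url, html) that can return additional candidate links, which are then filtered through link_regex (the max_depth parameter is accepted above, but its depth tracking is not shown in this excerpt). A minimal sketch of such a callback, using the hypothetical name print_title_callback and reusing get_links so the crawl keeps expanding:

def print_title_callback(url, html):
    # hypothetical callback: print each page's <title> as it is crawled
    match = re.search('<title>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
    if match:
        print url, match.group(1).strip()
    # hand back the page's raw links, since this version of the crawler
    # only queues links returned by the callback
    return get_links(html)

link_crawler('http://example.webscraping.com', '/(index|view)', scrape_callback=print_title_callback)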