Function crawl

examples/eventlet/webcrawler.py:47–68 · view source on GitHub ↗

(url, seen=None)

Source from the content-addressed store, hash-verified

45
@shared_task(ignore_result=True, serializer='pickle', compression='zlib')
def crawl(url, seen=None):
    """Fetch *url* and schedule a crawl subtask for each new same-domain link.

    The page body is scanned with ``url_regex``; every match that is not
    already in *seen* and whose domain contains the domain of *url* is
    recorded in *seen* and dispatched as a new ``crawl`` task via a group.

    Args:
        url: Address of the page to fetch.
        seen: Set-like object supporting ``in`` and ``.add()`` that tracks
            URLs already scheduled.  When ``None`` a fresh ``BloomFilter``
            is created.  It is passed along to every subtask (the task is
            declared with the pickle serializer, presumably so that this
            object can be shipped with the message -- confirm).

    Returns:
        None.  The function also returns early, scheduling nothing, when
        the request fails or does not complete within the timeout.
    """
    print(f'crawling: {url}')
    # Only build a new filter when the caller supplied none; an explicitly
    # passed filter is kept even if it is still empty.
    if seen is None:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    # Pre-bind so a silent timeout can be detected after the ``with`` block.
    response = None
    # NOTE(review): Timeout is presumably eventlet's; with ``False`` as the
    # second argument the block is abandoned silently on expiry -- confirm.
    with Timeout(5, False):
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            return

    if response is None:
        # The timeout fired before the request completed; nothing to parse.
        return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(response.text):
        found_url = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if found_url not in seen and location in domain(found_url):
            wanted_urls.append(found_url)
            seen.add(found_url)

    subtasks = group(crawl.s(wanted_url, seen) for wanted_url in wanted_urls)
    subtasks.delay()

nothing calls this directly

group (Class) · 0.90

Timeout (Class) · 0.85

domain (Function) · 0.85

get (Method) · 0.45

group (Method) · 0.45

add (Method) · 0.45

s (Method) · 0.45

delay (Method) · 0.45

no test coverage detected