| 45 | |
# NOTE(review): serializer='pickle' means workers unpickle the task payload;
# only run this against a broker that untrusted parties cannot write to.
@shared_task(ignore_result=True, serializer='pickle', compression='zlib')
def crawl(url, seen=None):
    """Fetch ``url`` and schedule a crawl of each new same-domain link in it.

    Args:
        url: Address of the page to download.
        seen: Filter of URLs that have already been scheduled. When omitted,
            a fresh ``BloomFilter`` is created. The same filter is handed to
            every subtask spawned from this page.

    Returns:
        None. The task returns early, scheduling nothing, when the request
        fails or does not complete within the timeout.
    """
    print(f'crawling: {url}')
    if seen is None:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    # Timeout(5, False) is presumably an eventlet/gevent-style silent timeout:
    # when it fires the ``with`` body is abandoned without raising. Pre-bind
    # ``response`` so that case is detectable instead of a NameError below.
    response = None
    with Timeout(5, False):
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            return
    if response is None:
        return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(response.text):
        link = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if link not in seen and location in domain(link):
            wanted_urls.append(link)
            seen.add(link)

    subtasks = group(crawl.s(link, seen) for link in wanted_urls)
    subtasks.delay()