| 52 | |
@asyncio.coroutine
def process(self, url):
    """Fetch *url*, queue the links found in it, and record the outcome.

    The URL is moved from ``self.todo`` to ``self.busy`` for the duration
    of the request.  When the response is a 200 with an HTML content type,
    every ``href`` target found in the body is handed to ``self.addurls``
    as ``(link, url)`` pairs in a separate task.  The result is stored in
    ``self.done[url]``: ``True`` when the page was fetched and handled
    without error, ``False`` when any exception occurred.

    The URL is always removed from ``self.busy`` and the session is always
    closed, even when the request or the body processing fails.
    """
    print('processing:', url)

    self.todo.remove(url)
    self.busy.add(url)
    session = aiohttp.ClientSession(connector=self.connector)
    try:
        resp = yield from session.request('get', url)
        try:
            # A response may carry no Content-Type header at all; default
            # to '' so the membership test cannot hit None.
            content_type = resp.headers.get('content-type', '')
            if resp.status == 200 and 'text/html' in content_type:
                data = (yield from resp.read()).decode('utf-8', 'replace')
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                asyncio.Task(self.addurls([(u, url) for u in urls]))
        finally:
            # Release the response even if reading or parsing failed.
            resp.close()
    except Exception as exc:
        # Best-effort crawl: a failure at any stage marks the URL as
        # failed instead of escaping and leaving it stuck in ``busy``.
        print('...', url, 'has error', repr(str(exc)))
        self.done[url] = False
    else:
        self.done[url] = True
    finally:
        self.busy.remove(url)
        # NOTE(review): the session is built on the shared
        # ``self.connector``; confirm that closing the session does not
        # also close that connector in the aiohttp version in use.
        session.close()

    print(len(self.done), 'completed tasks,', len(self.tasks),
          'still pending, todo', len(self.todo))
| 80 | |
| 81 | |
| 82 | def main(): |