
Commit

update manifest and add readmes
vdusek committed Sep 9, 2024
1 parent cdedd1d commit 0a62f26
Showing 5 changed files with 117 additions and 27 deletions.
66 changes: 64 additions & 2 deletions templates/manifest.json
@@ -146,7 +146,7 @@
},
{
"id": "python-empty",
"name": "python_empty",
"name": "python-empty",
"label": "Empty Python project",
"category": "python",
"description": "Empty template with basic structure for the Actor with Apify SDK that allows you to easily add your own functionality.",
@@ -166,7 +166,7 @@
},
{
"id": "python-standby",
"name": "python_standby",
"name": "python-standby",
"label": "Standby Python project",
"category": "python",
"description": "Template with basic structure for an Actor using Standby mode that allows you to easily add your own functionality.",
@@ -185,6 +185,68 @@
"STARTER"
]
},
{
"id": "python-crawlee-beautifulsoup",
"name": "python-crawlee-beautifulsoup",
"label": "Start with Python Crawlee and BeautifulSoup",
"category": "python",
"technologies": [
"crawlee",
"beautifulsoup"
],
"description": "Crawl and scrape websites using Crawlee and BeautifulSoup. Start from a given start URLs, and store results to Apify dataset.",
"messages": {
"postCreate": "To install additional Python packages, you need to activate the virtual environment in the \".venv\" folder in the actor directory."
},
"archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/python-crawlee-beautifulsoup.zip?raw=true",
"defaultRunOptions": {
"build": "latest",
"memoryMbytes": 1024,
"timeoutSecs": 3600
},
"aliases": [
"getting_started_crawlee_beautifulsoup"
],
"showcaseFiles": [
"src/main.py",
"src/__main__.py"
],
"useCases": [
"STARTER",
"WEB_SCRAPING"
]
},
{
"id": "python-crawlee-playwright",
"name": "python-crawlee-playwright",
"label": "Start with Python Crawlee and Playwright",
"category": "python",
"technologies": [
"crawlee",
"playwright"
],
"description": "Crawl and scrape websites using Crawlee and Playwright. Start from a given start URLs, and store results to Apify dataset.",
"messages": {
"postCreate": "To install additional Python packages, you need to activate the virtual environment in the \".venv\" folder in the actor directory."
},
"archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/python-crawlee-playwright.zip?raw=true",
"defaultRunOptions": {
"build": "latest",
"memoryMbytes": 1024,
"timeoutSecs": 3600
},
"aliases": [
"getting_started_crawlee_playwright"
],
"showcaseFiles": [
"src/main.py",
"src/__main__.py"
],
"useCases": [
"STARTER",
"WEB_SCRAPING"
]
},
{
"id": "js-start",
"name": "getting_started_node",
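Each new entry added to `templates/manifest.json` follows the same shape as the existing ones: an `id`, `name`, `label`, `category`, a list of `technologies`, a `description`, a zipped-template `archiveUrl`, `defaultRunOptions`, `aliases`, `showcaseFiles`, and `useCases`. A small sketch of reading the manifest with only the standard library (it assumes the entries live under a top-level `templates` array, which is not visible in this diff):

```python
import json
from pathlib import Path

# Load the template manifest and print the Python Crawlee templates added in this commit.
# Assumes a top-level "templates" array; adjust if the manifest is structured differently.
manifest = json.loads(Path('templates/manifest.json').read_text(encoding='utf-8'))

for template in manifest['templates']:
    if template['id'].startswith('python-crawlee-'):
        print(f"{template['label']} ({template['id']})")
        print(f"  archive: {template['archiveUrl']}")
```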
24 changes: 23 additions & 1 deletion templates/python-crawlee-beautifulsoup/README.md
@@ -1,3 +1,25 @@
## Crawlee with BeautifulSoup

TODO
A template for [web scraping](https://apify.com/web-scraping) data from websites, starting from provided URLs, using Python. The starting URLs are passed in via the Actor's input, which is defined by the [input schema](https://docs.apify.com/platform/actors/development/input-schema). The template uses [Crawlee for Python](https://crawlee.dev/python) for efficient web crawling, handling each request through a user-defined handler that uses [Beautiful Soup](https://pypi.org/project/beautifulsoup4/) to extract data from the page. Enqueued URLs are managed in the [request queue](https://crawlee.dev/python/api/class/RequestQueue), and the extracted data is saved in a [dataset](https://crawlee.dev/python/api/class/Dataset) for easy access.

## Included features

- **[Apify SDK](https://docs.apify.com/sdk/python/)** - a toolkit for building Apify [Actors](https://apify.com/actors) in Python.
- **[Crawlee for Python](https://crawlee.dev/python/)** - a web scraping and browser automation library.
- **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and validate a schema for your Actor's input.
- **[Request queue](https://crawlee.dev/python/api/class/RequestQueue)** - manage the URLs you want to scrape in a queue.
- **[Dataset](https://crawlee.dev/python/api/class/Dataset)** - store and access structured data extracted from web pages.
- **[Beautiful Soup](https://pypi.org/project/beautifulsoup4/)** - a library for pulling data out of HTML and XML files.

## Resources

- [Video introduction to Python SDK](https://www.youtube.com/watch?v=C8DmvJQS3jk)
- [Webinar introducing Crawlee for Python](https://www.youtube.com/live/ip8Ii0eLfRY)
- [Apify Python SDK documentation](https://docs.apify.com/sdk/python/)
- [Crawlee for Python documentation](https://crawlee.dev/python/docs/quick-start)
- [Python tutorials in Academy](https://docs.apify.com/academy/python)
- [Integration with Make, GitHub, Zapier, Google Drive, and other apps](https://apify.com/integrations)
- [Video guide on getting scraped data using Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM)
- [A short video guide on how to build web scrapers using code templates](https://www.youtube.com/watch?v=u-i-Korzf8w)
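The flow this README describes — start URLs taken from the Actor input, a user-defined handler that parses each page with Beautiful Soup, discovered links fed back into the request queue, and results pushed to a dataset — maps onto a few lines of Crawlee code. A minimal illustrative sketch, not the template's actual `src/main.py` (which is shown in the next file); import paths match the Crawlee for Python release current around this commit and may differ in later versions:

```python
from apify import Actor
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    async with Actor:
        # Read the start URLs from the Actor input (validated by the input schema).
        actor_input = await Actor.get_input() or {}
        start_urls = [item['url'] for item in actor_input.get('start_urls', [])]

        crawler = BeautifulSoupCrawler(max_requests_per_crawl=50)

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            # Extract data from the parsed page and store it in the default dataset.
            await context.push_data({
                'url': context.request.url,
                'title': context.soup.title.string if context.soup.title else None,
            })
            # Enqueue links found on the page into the request queue.
            await context.enqueue_links()

        await crawler.run(start_urls)
```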
12 changes: 2 additions & 10 deletions templates/python-crawlee-beautifulsoup/src/main.py
@@ -21,21 +21,13 @@ async def main() -> None:
# Read the Actor input.
actor_input = await Actor.get_input() or {}
start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
start_urls_list = [url.get('url') for url in start_urls]

# Exit if no start URLs are provided.
if not start_urls:
Actor.log.info('No start URLs specified in Actor input, exiting...')
await Actor.exit()

# Prepare a list of starting requests.
start_requests = [
Request.from_url(
url=url.get('url'),
user_data={'depth': 0}, # Set initial crawl depth to 0.
)
for url in start_urls
]

# Create a crawler.
crawler = BeautifulSoupCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
@@ -64,4 +56,4 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await context.enqueue_links()

# Run the crawler with the starting requests.
await crawler.run(start_requests)
await crawler.run(start_urls_list)
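The change above works because `crawler.run()` accepts plain URL strings as well as `Request` objects, so the explicit `Request.from_url(...)` construction (and the initial `user_data={'depth': 0}`) is no longer needed for a simple crawl. A small self-contained sketch of the simplified form (illustrative only; the trivial handler exists just so the crawler can run, and `Request` would be imported from `crawlee` if the verbose form were used):

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}')

    # Plain URL strings are enough - Crawlee wraps them in Request objects internally.
    # The previous, more verbose form is only needed when attaching per-request
    # metadata, e.g.:
    #   await crawler.run([Request.from_url(url, user_data={'depth': 0})])
    await crawler.run(['https://apify.com'])


if __name__ == '__main__':
    asyncio.run(main())
```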
24 changes: 23 additions & 1 deletion templates/python-crawlee-playwright/README.md
@@ -1,3 +1,25 @@
## Crawlee with Playwright

TODO
A template for [web scraping](https://apify.com/web-scraping) data from websites, starting from provided URLs, using Python. The starting URLs are passed in via the Actor's input, which is defined by the [input schema](https://docs.apify.com/platform/actors/development/input-schema). The template uses [Crawlee for Python](https://crawlee.dev/python) for efficient web crawling, making requests via a headless browser managed by [Playwright](https://playwright.dev/python/), and handling each request through a user-defined handler that uses the Playwright API to extract data from the page. Enqueued URLs are managed in the [request queue](https://crawlee.dev/python/api/class/RequestQueue), and the extracted data is saved in a [dataset](https://crawlee.dev/python/api/class/Dataset) for easy access.

## Included features

- **[Apify SDK](https://docs.apify.com/sdk/python/)** - a toolkit for building Apify [Actors](https://apify.com/actors) in Python.
- **[Crawlee for Python](https://crawlee.dev/python/)** - a web scraping and browser automation library.
- **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and validate a schema for your Actor's input.
- **[Request queue](https://crawlee.dev/python/api/class/RequestQueue)** - manage the URLs you want to scrape in a queue.
- **[Dataset](https://crawlee.dev/python/api/class/Dataset)** - store and access structured data extracted from web pages.
- **[Playwright](https://playwright.dev/python/)** - a library for managing headless browsers.

## Resources

- [Video introduction to Python SDK](https://www.youtube.com/watch?v=C8DmvJQS3jk)
- [Webinar introducing Crawlee for Python](https://www.youtube.com/live/ip8Ii0eLfRY)
- [Apify Python SDK documentation](https://docs.apify.com/sdk/python/)
- [Crawlee for Python documentation](https://crawlee.dev/python/docs/quick-start)
- [Python tutorials in Academy](https://docs.apify.com/academy/python)
- [Integration with Make, GitHub, Zapier, Google Drive, and other apps](https://apify.com/integrations)
- [Video guide on getting scraped data using Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM)
- [A short video guide on how to build web scrapers using code templates](https://www.youtube.com/watch?v=u-i-Korzf8w)
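As with the BeautifulSoup variant, the pieces listed above map onto a compact `PlaywrightCrawler` setup; the difference is that each request is served by a live page in a headless browser rather than a parsed HTML document. An illustrative sketch, not the template's actual `src/main.py` (shown in the next file); the `headless=True` option is an assumption added for illustration, and import paths match the Crawlee release current around this commit:

```python
from apify import Actor
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    async with Actor:
        # Read the start URLs from the Actor input (validated by the input schema).
        actor_input = await Actor.get_input() or {}
        start_urls = [item['url'] for item in actor_input.get('start_urls', [])]

        # Each request is handled in a headless browser page managed by Playwright.
        crawler = PlaywrightCrawler(max_requests_per_crawl=50, headless=True)

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            # Use the live Playwright page for extraction, then store the result.
            await context.push_data({
                'url': context.request.url,
                'title': await context.page.title(),
            })
            # Enqueue links found on the page into the request queue.
            await context.enqueue_links()

        await crawler.run(start_urls)
```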
18 changes: 5 additions & 13 deletions templates/python-crawlee-playwright/src/main.py
@@ -21,21 +21,13 @@ async def main() -> None:
# Read the Actor input.
actor_input = await Actor.get_input() or {}
start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
start_urls_list = [url.get('url') for url in start_urls]

# Exit if no start URLs are provided.
if not start_urls:
Actor.log.info('No start URLs specified in Actor input, exiting...')
await Actor.exit()

# Prepare a list of starting requests.
start_requests = [
Request.from_url(
url=url.get('url'),
user_data={'depth': 0}, # Set initial crawl depth to 0.
)
for url in start_urls
]

# Create a crawler.
crawler = PlaywrightCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
@@ -53,9 +45,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
data = {
'url': context.request.url,
'title': await context.page.title(),
# 'h1s': [h1.text for h1 in context.soup.find_all('h1')],
# 'h2s': [h2.text for h2 in context.soup.find_all('h2')],
# 'h3s': [h3.text for h3 in context.soup.find_all('h3')],
'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
}

# Save the extracted data to the default dataset.
@@ -65,4 +57,4 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
await context.enqueue_links()

# Run the crawler with the starting requests.
await crawler.run(start_requests)
await crawler.run(start_urls_list)
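The new extraction lines in this diff use Playwright's locator API instead of the commented-out BeautifulSoup calls: `page.locator('h1').all()` resolves to every matching element, and `text_content()` reads each element's text. The same pattern, pulled out into a standalone helper for clarity (the helper name and module are illustrative, not part of the template):

```python
from __future__ import annotations

from playwright.async_api import Page


async def texts_for_selector(page: Page, selector: str) -> list[str | None]:
    """Return the text content of every element on the page matching the selector."""
    return [await element.text_content() for element in await page.locator(selector).all()]
```

In the handler itself the template simply inlines this comprehension once for each of the `h1`, `h2`, and `h3` selectors.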
