
Commit

update manifest and add readmes
vdusek committed Sep 9, 2024
1 parent cdedd1d commit 0a62f26
Showing 5 changed files with 117 additions and 27 deletions.
66 changes: 64 additions & 2 deletions templates/manifest.json
@@ -146,7 +146,7 @@
},
{
"id": "python-empty",
"name": "python_empty",
"name": "python-empty",
"label": "Empty Python project",
"category": "python",
"description": "Empty template with basic structure for the Actor with Apify SDK that allows you to easily add your own functionality.",
@@ -166,7 +166,7 @@
},
{
"id": "python-standby",
"name": "python_standby",
"name": "python-standby",
"label": "Standby Python project",
"category": "python",
"description": "Template with basic structure for an Actor using Standby mode that allows you to easily add your own functionality.",
@@ -185,6 +185,68 @@
"STARTER"
]
},
{
"id": "python-crawlee-beautifulsoup",
"name": "python-crawlee-beautifulsoup",
"label": "Start with Python Crawlee and BeautifulSoup",
"category": "python",
"technologies": [
"crawlee",
"beautifulsoup"
],
"description": "Crawl and scrape websites using Crawlee and BeautifulSoup. Start from a given start URLs, and store results to Apify dataset.",
"messages": {
"postCreate": "To install additional Python packages, you need to activate the virtual environment in the \".venv\" folder in the actor directory."
},
"archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/python-crawlee-beautifulsoup.zip?raw=true",
"defaultRunOptions": {
"build": "latest",
"memoryMbytes": 1024,
"timeoutSecs": 3600
},
"aliases": [
"getting_started_crawlee_beautifulsoup"
],
"showcaseFiles": [
"src/main.py",
"src/__main__.py"
],
"useCases": [
"STARTER",
"WEB_SCRAPING"
]
},
{
"id": "python-crawlee-playwright",
"name": "python-crawlee-playwright",
"label": "Start with Python Crawlee and Playwright",
"category": "python",
"technologies": [
"crawlee",
"playwright"
],
"description": "Crawl and scrape websites using Crawlee and Playwright. Start from a given start URLs, and store results to Apify dataset.",
"messages": {
"postCreate": "To install additional Python packages, you need to activate the virtual environment in the \".venv\" folder in the actor directory."
},
"archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/python-crawlee-playwright.zip?raw=true",
"defaultRunOptions": {
"build": "latest",
"memoryMbytes": 1024,
"timeoutSecs": 3600
},
"aliases": [
"getting_started_crawlee_playwright"
],
"showcaseFiles": [
"src/main.py",
"src/__main__.py"
],
"useCases": [
"STARTER",
"WEB_SCRAPING"
]
},
{
"id": "js-start",
"name": "getting_started_node",
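Each new entry added to `templates/manifest.json` follows the same shape as the existing ones: an `id`, `name`, `label`, `category`, a list of `technologies`, a `description`, a zipped-template `archiveUrl`, `defaultRunOptions`, `aliases`, `showcaseFiles`, and `useCases`. A small sketch of reading the manifest with only the standard library (it assumes the entries live under a top-level `templates` array, which is not visible in this diff):

```python
import json
from pathlib import Path

# Load the template manifest and print the Python Crawlee templates added in this commit.
# Assumes a top-level "templates" array; adjust if the manifest is structured differently.
manifest = json.loads(Path('templates/manifest.json').read_text(encoding='utf-8'))

for template in manifest['templates']:
    if template['id'].startswith('python-crawlee-'):
        print(f"{template['label']} ({template['id']})")
        print(f"  archive: {template['archiveUrl']}")
```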
24 changes: 23 additions & 1 deletion templates/python-crawlee-beautifulsoup/README.md
@@ -1,3 +1,25 @@
## Crawlee with BeautifulSoup

TODO
A template for [web scraping](https://apify.com/web-scraping) data from websites, starting from provided URLs, using Python. The starting URLs are passed in via the Actor's input, which is defined by the [input schema](https://docs.apify.com/platform/actors/development/input-schema). The template uses [Crawlee for Python](https://crawlee.dev/python) for efficient web crawling, handling each request through a user-defined handler that uses [Beautiful Soup](https://pypi.org/project/beautifulsoup4/) to extract data from the page. Enqueued URLs are managed in the [request queue](https://crawlee.dev/python/api/class/RequestQueue), and the extracted data is saved in a [dataset](https://crawlee.dev/python/api/class/Dataset) for easy access.

## Included features

- **[Apify SDK](https://docs.apify.com/sdk/python/)** - a toolkit for building Apify [Actors](https://apify.com/actors) in Python.
- **[Crawlee for Python](https://crawlee.dev/python/)** - a web scraping and browser automation library.
- **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and validate a schema for your Actor's input.
- **[Request queue](https://crawlee.dev/python/api/class/RequestQueue)** - manage the URLs you want to scrape in a queue.
- **[Dataset](https://crawlee.dev/python/api/class/Dataset)** - store and access structured data extracted from web pages.
- **[Beautiful Soup](https://pypi.org/project/beautifulsoup4/)** - a library for pulling data out of HTML and XML files.

## Resources

- [Video introduction to Python SDK](https://www.youtube.com/watch?v=C8DmvJQS3jk)
- [Webinar introducing Crawlee for Python](https://www.youtube.com/live/ip8Ii0eLfRY)
- [Apify Python SDK documentation](https://docs.apify.com/sdk/python/)
- [Crawlee for Python documentation](https://crawlee.dev/python/docs/quick-start)
- [Python tutorials in Academy](https://docs.apify.com/academy/python)
- [Integration with Make, GitHub, Zapier, Google Drive, and other apps](https://apify.com/integrations)
- [Video guide on getting scraped data using Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM)
- [A short video guide on how to build web scrapers using code templates](https://www.youtube.com/watch?v=u-i-Korzf8w)
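The flow this README describes — start URLs taken from the Actor input, a user-defined handler that parses each page with Beautiful Soup, discovered links fed back into the request queue, and results pushed to a dataset — maps onto a few lines of Crawlee code. A minimal illustrative sketch, not the template's actual `src/main.py` (which is shown in the next file); import paths match the Crawlee for Python release current around this commit and may differ in later versions:

```python
from apify import Actor
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    async with Actor:
        # Read the start URLs from the Actor input (validated by the input schema).
        actor_input = await Actor.get_input() or {}
        start_urls = [item['url'] for item in actor_input.get('start_urls', [])]

        crawler = BeautifulSoupCrawler(max_requests_per_crawl=50)

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            # Extract data from the parsed page and store it in the default dataset.
            await context.push_data({
                'url': context.request.url,
                'title': context.soup.title.string if context.soup.title else None,
            })
            # Enqueue links found on the page into the request queue.
            await context.enqueue_links()

        await crawler.run(start_urls)
```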
12 changes: 2 additions & 10 deletions templates/python-crawlee-beautifulsoup/src/main.py
@@ -21,21 +21,13 @@ async def main() -> None:
# Read the Actor input.
actor_input = await Actor.get_input() or {}
start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
start_urls_list = [url.get('url') for url in start_urls]

# Exit if no start URLs are provided.
if not start_urls:
Actor.log.info('No start URLs specified in Actor input, exiting...')
await Actor.exit()

# Prepare a list of starting requests.
start_requests = [
Request.from_url(
url=url.get('url'),
user_data={'depth': 0}, # Set initial crawl depth to 0.
)
for url in start_urls
]

# Create a crawler.
crawler = BeautifulSoupCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
@@ -64,4 +56,4 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await context.enqueue_links()

# Run the crawler with the starting requests.
await crawler.run(start_requests)
await crawler.run(start_urls_list)
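The change above works because `crawler.run()` accepts plain URL strings as well as `Request` objects, so the explicit `Request.from_url(...)` construction (and the initial `user_data={'depth': 0}`) is no longer needed for a simple crawl. A small self-contained sketch of the simplified form (illustrative only; the trivial handler exists just so the crawler can run, and `Request` would be imported from `crawlee` if the verbose form were used):

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}')

    # Plain URL strings are enough - Crawlee wraps them in Request objects internally.
    # The previous, more verbose form is only needed when attaching per-request
    # metadata, e.g.:
    #   await crawler.run([Request.from_url(url, user_data={'depth': 0})])
    await crawler.run(['https://apify.com'])


if __name__ == '__main__':
    asyncio.run(main())
```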
24 changes: 23 additions & 1 deletion templates/python-crawlee-playwright/README.md
@@ -1,3 +1,25 @@
## Crawlee with Playwright

TODO
A template for [web scraping](https://apify.com/web-scraping) data from websites, starting from provided URLs, using Python. The starting URLs are passed in via the Actor's input, which is defined by the [input schema](https://docs.apify.com/platform/actors/development/input-schema). The template uses [Crawlee for Python](https://crawlee.dev/python) for efficient web crawling, making requests via a headless browser managed by [Playwright](https://playwright.dev/python/), and handling each request through a user-defined handler that uses the Playwright API to extract data from the page. Enqueued URLs are managed in the [request queue](https://crawlee.dev/python/api/class/RequestQueue), and the extracted data is saved in a [dataset](https://crawlee.dev/python/api/class/Dataset) for easy access.

## Included features

- **[Apify SDK](https://docs.apify.com/sdk/python/)** - a toolkit for building Apify [Actors](https://apify.com/actors) in Python.
- **[Crawlee for Python](https://crawlee.dev/python/)** - a web scraping and browser automation library.
- **[Input schema](https://docs.apify.com/platform/actors/development/input-schema)** - define and validate a schema for your Actor's input.
- **[Request queue](https://crawlee.dev/python/api/class/RequestQueue)** - manage the URLs you want to scrape in a queue.
- **[Dataset](https://crawlee.dev/python/api/class/Dataset)** - store and access structured data extracted from web pages.
- **[Playwright](https://playwright.dev/python/)** - a library for managing headless browsers.

## Resources

- [Video introduction to Python SDK](https://www.youtube.com/watch?v=C8DmvJQS3jk)
- [Webinar introducing Crawlee for Python](https://www.youtube.com/live/ip8Ii0eLfRY)
- [Apify Python SDK documentation](https://docs.apify.com/sdk/python/)
- [Crawlee for Python documentation](https://crawlee.dev/python/docs/quick-start)
- [Python tutorials in Academy](https://docs.apify.com/academy/python)
- [Integration with Make, GitHub, Zapier, Google Drive, and other apps](https://apify.com/integrations)
- [Video guide on getting scraped data using Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM)
- [A short video guide on how to build web scrapers using code templates](https://www.youtube.com/watch?v=u-i-Korzf8w)
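As with the BeautifulSoup variant, the pieces listed above map onto a compact `PlaywrightCrawler` setup; the difference is that each request is served by a live page in a headless browser rather than a parsed HTML document. An illustrative sketch, not the template's actual `src/main.py` (shown in the next file); the `headless=True` option is an assumption added for illustration, and import paths match the Crawlee release current around this commit:

```python
from apify import Actor
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    async with Actor:
        # Read the start URLs from the Actor input (validated by the input schema).
        actor_input = await Actor.get_input() or {}
        start_urls = [item['url'] for item in actor_input.get('start_urls', [])]

        # Each request is handled in a headless browser page managed by Playwright.
        crawler = PlaywrightCrawler(max_requests_per_crawl=50, headless=True)

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            # Use the live Playwright page for extraction, then store the result.
            await context.push_data({
                'url': context.request.url,
                'title': await context.page.title(),
            })
            # Enqueue links found on the page into the request queue.
            await context.enqueue_links()

        await crawler.run(start_urls)
```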
18 changes: 5 additions & 13 deletions templates/python-crawlee-playwright/src/main.py
@@ -21,21 +21,13 @@ async def main() -> None:
# Read the Actor input.
actor_input = await Actor.get_input() or {}
start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
start_urls_list = [url.get('url') for url in start_urls]

# Exit if no start URLs are provided.
if not start_urls:
Actor.log.info('No start URLs specified in Actor input, exiting...')
await Actor.exit()

# Prepare a list of starting requests.
start_requests = [
Request.from_url(
url=url.get('url'),
user_data={'depth': 0}, # Set initial crawl depth to 0.
)
for url in start_urls
]

# Create a crawler.
crawler = PlaywrightCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
@@ -53,9 +45,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
data = {
'url': context.request.url,
'title': await context.page.title(),
# 'h1s': [h1.text for h1 in context.soup.find_all('h1')],
# 'h2s': [h2.text for h2 in context.soup.find_all('h2')],
# 'h3s': [h3.text for h3 in context.soup.find_all('h3')],
'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
}

# Save the extracted data to the default dataset.
@@ -65,4 +57,4 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
await context.enqueue_links()

# Run the crawler with the starting requests.
await crawler.run(start_requests)
await crawler.run(start_urls_list)
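The new extraction lines in this diff use Playwright's locator API instead of the commented-out BeautifulSoup calls: `page.locator('h1').all()` resolves to every matching element, and `text_content()` reads each element's text. The same pattern, pulled out into a standalone helper for clarity (the helper name and module are illustrative, not part of the template):

```python
from __future__ import annotations

from playwright.async_api import Page


async def texts_for_selector(page: Page, selector: str) -> list[str | None]:
    """Return the text content of every element on the page matching the selector."""
    return [await element.text_content() for element in await page.locator(selector).all()]
```

In the handler itself the template simply inlines this comprehension once for each of the `h1`, `h2`, and `h3` selectors.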
