diff --git a/data/cookbook/extract-links-from-html/00-lambdasoup.ml b/data/cookbook/extract-links-from-html/00-lambdasoup.ml new file mode 100644 index 0000000000..abd7232707 --- /dev/null +++ b/data/cookbook/extract-links-from-html/00-lambdasoup.ml @@ -0,0 +1,59 @@ +--- +packages: + - name: "lambdasoup" + tested_version: "1.0.0" + used_libraries: + - lambdasoup +discussion: | + - **Reference:** The lambdasoup package provides a robust toolset for working with HTML. [github.com/aantron/lambdasoup](https://github.com/aantron/lambdasoup?tab=readme-ov-file) +--- + +(* + +`find_links` accepts an argument `html_content` of type string that contains our HTML content and prints +the value of the `href` attribute of each link. + +`parse` from the `Soup` library produces a document node representing the HTML string. + +`$$` selects nodes in the document using the selector query. + +*) +open Soup + +let find_links html_content = + let document_node = parse html_content in + document_node $$ "a[href]" |> iter (fun a -> print_endline (R.attribute "href" a)) + + +(* Example usage *) +let html_content = " + + + + + Sample HTML Page + + +
+

My Cool Learning Links

+
+
+
+

Click a link to get started!

+ +
+
+ +" + +(*Expected output: +https://ocaml.org/docs +https://pola.rs/ +https://www.nonexistentwebsite.com +*) +let () = find_links html_content +