From 8ea49e9403b141bd10fd8faa387749ea210b1d4a Mon Sep 17 00:00:00 2001 From: Grant Smith <57376089+ggsmith842@users.noreply.github.com> Date: Wed, 26 Jun 2024 15:14:42 -0600 Subject: [PATCH 1/7] Cookbook Extract Links from HTML --- .../cookbook/extract-links-from-html/00-Re.ml | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 data/cookbook/extract-links-from-html/00-Re.ml diff --git a/data/cookbook/extract-links-from-html/00-Re.ml b/data/cookbook/extract-links-from-html/00-Re.ml new file mode 100644 index 0000000000..75f2b4534c --- /dev/null +++ b/data/cookbook/extract-links-from-html/00-Re.ml @@ -0,0 +1,68 @@ +--- +packages: + - name: "re" + tested_version: "1.11.0" + used_libraries: + - re +--- + +(* + +Given an HTML document or string we can use the `re` library to create a regular expression that finds the href tags containing web links. +For example in the sample below we would expect to find three links for this HTML document. + +Sample HTML: + + + + + + Sample HTML Page + + +
+    <div>
+      <h1>My Cool Learning Links</h1>
+    </div>
+    <div>
+      <a href="https://ocaml.org/docs">OCaml Docs</a>
+      <a href="https://pola.rs/">Polars</a>
+      <a href="https://www.nonexistentwebsite.com">Nonexistent Website</a>
+    </div>
+    <div>
+      <p>Click a link to get started!</p>
+    </div>
+  </body>
+</html>
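+
+Expected links: https://ocaml.org/docs, https://pola.rs/, and https://www.nonexistentwebsite.com.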
+ + + +`find_links` accepts an argument `html_content` of type string that contains our HTML content. + +Using `Re.Perl.re` create a Perl flavored regular expression that searches for the `a href` tags. You can view the pattern using +to understand more about what is going on. [Regex101](https://regex101.com/r/2Bs442/1) + +`Re.all` searches the entire `html_content` string for the `pattern`. + +We then pipe the output to `List.map` since `Re.all` returns a list of all matches and apply the `Re.group.get` function to each group +in the list. Passing `1` we get the substring versus the entire matching group. This way we only get the url link and not the entire html tag. + +`List.iter` iterates through the `links` list and prints the urls. + +*) +let find_links html_content = + let pattern = Re.compile (Re.Perl.re "]* href=\"([^\"]*)") in + let links = Re.all pattern html_content + |> List.map (fun group -> Re.Group.get group 1) in + List.iter print_endline links + + +(* +Example usage: +First, define helper function `read_file` to read in html content at the path. Note: you may need to include `Open Stdlib` to access `In_channel` +*) +let read_file file = + In_channel.with_open_text file In_channel.input_all + + +(* Open the html file using `read_file` and find the links using the `find_links` function. *) +let () = find_links (read_file "lib/webhtml/index.html") From 3877f3f3b195c5e7a64e84a3d18e1282ba102e87 Mon Sep 17 00:00:00 2001 From: Christine Rose Date: Thu, 27 Jun 2024 02:15:33 -0700 Subject: [PATCH 2/7] formatting, grammar, verb agreement, etc. --- data/cookbook/extract-links-from-html/00-Re.ml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/data/cookbook/extract-links-from-html/00-Re.ml b/data/cookbook/extract-links-from-html/00-Re.ml index 75f2b4534c..72bfc0ef17 100644 --- a/data/cookbook/extract-links-from-html/00-Re.ml +++ b/data/cookbook/extract-links-from-html/00-Re.ml @@ -8,8 +8,8 @@ packages: (* -Given an HTML document or string we can use the `re` library to create a regular expression that finds the href tags containing web links. -For example in the sample below we would expect to find three links for this HTML document. +Given an HTML document or string, we can use the `re` library to create a regular expression that finds the `href` tags containing web links. +For example, in the sample below we would expect to find three links for this HTML document. Sample HTML: @@ -43,10 +43,10 @@ to understand more about what is going on. [Regex101](https://regex101.com/r/2Bs `Re.all` searches the entire `html_content` string for the `pattern`. -We then pipe the output to `List.map` since `Re.all` returns a list of all matches and apply the `Re.group.get` function to each group -in the list. Passing `1` we get the substring versus the entire matching group. This way we only get the url link and not the entire html tag. +We then pipe the output to `List.map`, since `Re.all` returns a list of all matches and applies the `Re.group.get` function to each group +in the list. Passing `1` we get the substring versus the entire matching group. This way we only get the URL link and not the entire HTML tag. -`List.iter` iterates through the `links` list and prints the urls. +`List.iter` iterates through the `links` list and prints the URLs. *) let find_links html_content = @@ -58,11 +58,11 @@ let find_links html_content = (* Example usage: -First, define helper function `read_file` to read in html content at the path. 
Note: you may need to include `Open Stdlib` to access `In_channel` +First, define helper function `read_file` to read in HTML content at the path. Note: you may need to include `Open Stdlib` to access `In_channel` *) let read_file file = In_channel.with_open_text file In_channel.input_all -(* Open the html file using `read_file` and find the links using the `find_links` function. *) +(* Open the HTML file using `read_file` and find the links using the `find_links` function. *) let () = find_links (read_file "lib/webhtml/index.html") From e56dee5d15a6642fa70a307ee008e3871b8ddad4 Mon Sep 17 00:00:00 2001 From: Grant Smith <57376089+ggsmith842@users.noreply.github.com> Date: Thu, 27 Jun 2024 10:16:56 -0600 Subject: [PATCH 3/7] Update data/cookbook/extract-links-from-html/00-Re.ml Co-authored-by: Christine Rose --- data/cookbook/extract-links-from-html/00-Re.ml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/cookbook/extract-links-from-html/00-Re.ml b/data/cookbook/extract-links-from-html/00-Re.ml index 72bfc0ef17..24396d27b9 100644 --- a/data/cookbook/extract-links-from-html/00-Re.ml +++ b/data/cookbook/extract-links-from-html/00-Re.ml @@ -38,8 +38,8 @@ Sample HTML: `find_links` accepts an argument `html_content` of type string that contains our HTML content. -Using `Re.Perl.re` create a Perl flavored regular expression that searches for the `a href` tags. You can view the pattern using -to understand more about what is going on. [Regex101](https://regex101.com/r/2Bs442/1) +Use `Re.Perl.re` to create a Perl flavored regular expression that searches for the `a href` tags. You can view the pattern using [Regex101](https://regex101.com/r/2Bs442/1) +to understand more about what is going on. `Re.all` searches the entire `html_content` string for the `pattern`. From 4b294627a5af41d555b754c29226e55b9929c330 Mon Sep 17 00:00:00 2001 From: Grant Smith <57376089+ggsmith842@users.noreply.github.com> Date: Thu, 27 Jun 2024 10:17:07 -0600 Subject: [PATCH 4/7] Update data/cookbook/extract-links-from-html/00-Re.ml Co-authored-by: Christine Rose --- data/cookbook/extract-links-from-html/00-Re.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/cookbook/extract-links-from-html/00-Re.ml b/data/cookbook/extract-links-from-html/00-Re.ml index 24396d27b9..a88b73d5ed 100644 --- a/data/cookbook/extract-links-from-html/00-Re.ml +++ b/data/cookbook/extract-links-from-html/00-Re.ml @@ -58,7 +58,7 @@ let find_links html_content = (* Example usage: -First, define helper function `read_file` to read in HTML content at the path. Note: you may need to include `Open Stdlib` to access `In_channel` +First, define the helper function's `read_file` in order to read it in HTML content at the path. Note: you may need to include `Open Stdlib` to access `In_channel`. *) let read_file file = In_channel.with_open_text file In_channel.input_all From da5c2eb90f1cb4121e23a00d034ff74119813194 Mon Sep 17 00:00:00 2001 From: Grant Smith <57376089+ggsmith842@users.noreply.github.com> Date: Thu, 27 Jun 2024 10:41:20 -0600 Subject: [PATCH 5/7] Update 00-Re.ml Simplified comments and removed `read_file` section. It is now replaced with an HTML string. I also removed the sample HTML that would render for simplicity. 
--- .../cookbook/extract-links-from-html/00-Re.ml | 65 +++++++++---------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/data/cookbook/extract-links-from-html/00-Re.ml b/data/cookbook/extract-links-from-html/00-Re.ml index a88b73d5ed..bb8444050e 100644 --- a/data/cookbook/extract-links-from-html/00-Re.ml +++ b/data/cookbook/extract-links-from-html/00-Re.ml @@ -8,15 +8,29 @@ packages: (* -Given an HTML document or string, we can use the `re` library to create a regular expression that finds the `href` tags containing web links. -For example, in the sample below we would expect to find three links for this HTML document. +`find_links` accepts an argument `html_content` of type string that contains our HTML content and returns +the content of the `href` tags. -Sample HTML: +You can view the pattern using [Regex101](https://regex101.com/r/2Bs442/1) +to understand more about what is going on. + +`Re.all` searches the entire `html_content` string for the `pattern`. Passing `1` to `Re.Group.get` returns the +substring versus the entire matching group. +*) +let find_links html_content = + let pattern = Re.compile (Re.Perl.re "]* href=\"([^\"]*)") in + let links = Re.all pattern html_content + |> List.map (fun group -> Re.Group.get group 1) in + List.iter print_endline links + + +(* Example usage *) +let html_content = " - + - + Sample HTML Page @@ -27,42 +41,21 @@ Sample HTML:
Click a link to get started!
- - -`find_links` accepts an argument `html_content` of type string that contains our HTML content. - -Use `Re.Perl.re` to create a Perl flavored regular expression that searches for the `a href` tags. You can view the pattern using [Regex101](https://regex101.com/r/2Bs442/1) -to understand more about what is going on. - -`Re.all` searches the entire `html_content` string for the `pattern`. - -We then pipe the output to `List.map`, since `Re.all` returns a list of all matches and applies the `Re.group.get` function to each group -in the list. Passing `1` we get the substring versus the entire matching group. This way we only get the URL link and not the entire HTML tag. - -`List.iter` iterates through the `links` list and prints the URLs. - -*) -let find_links html_content = - let pattern = Re.compile (Re.Perl.re "]* href=\"([^\"]*)") in - let links = Re.all pattern html_content - |> List.map (fun group -> Re.Group.get group 1) in - List.iter print_endline links +" + +(*Expected output: +https://ocaml.org/docs +https://pola.rs/ +https://www.nonexistentwebsite.com -(* -Example usage: -First, define the helper function's `read_file` in order to read it in HTML content at the path. Note: you may need to include `Open Stdlib` to access `In_channel`. *) -let read_file file = - In_channel.with_open_text file In_channel.input_all +let () = find_links html_content - -(* Open the HTML file using `read_file` and find the links using the `find_links` function. *) -let () = find_links (read_file "lib/webhtml/index.html") From e2db468a2fc8c8468dcb1c9d5675f8f20562aca7 Mon Sep 17 00:00:00 2001 From: Grant Smith <57376089+ggsmith842@users.noreply.github.com> Date: Thu, 4 Jul 2024 14:27:34 -0600 Subject: [PATCH 6/7] Update and rename 00-Re.ml to 00-lambdasoup.ml Changed from using regular expression to lambdasoup due to concerns raised about issues working with HTML using regular expressions. --- .../{00-Re.ml => 00-lambdasoup.ml} | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) rename data/cookbook/extract-links-from-html/{00-Re.ml => 00-lambdasoup.ml} (64%) diff --git a/data/cookbook/extract-links-from-html/00-Re.ml b/data/cookbook/extract-links-from-html/00-lambdasoup.ml similarity index 64% rename from data/cookbook/extract-links-from-html/00-Re.ml rename to data/cookbook/extract-links-from-html/00-lambdasoup.ml index bb8444050e..94ecb45fe9 100644 --- a/data/cookbook/extract-links-from-html/00-Re.ml +++ b/data/cookbook/extract-links-from-html/00-lambdasoup.ml @@ -1,9 +1,11 @@ --- packages: - - name: "re" - tested_version: "1.11.0" + - name: "lambdasoup" + tested_version: "1.0.0" used_libraries: - - re + - lambdasoup +discussion: | + - **Refernce:** The lambdasoup package provides a robust toolset for working with HTML. [github.com/lambdasoup](https://github.com/aantron/lambdasoup?tab=readme-ov-file) --- (* @@ -11,18 +13,16 @@ packages: `find_links` accepts an argument `html_content` of type string that contains our HTML content and returns the content of the `href` tags. -You can view the pattern using [Regex101](https://regex101.com/r/2Bs442/1) -to understand more about what is going on. +`parse` from the `Soup` library produces a document node representing the HTML string. -`Re.all` searches the entire `html_content` string for the `pattern`. Passing `1` to `Re.Group.get` returns the -substring versus the entire matching group. +`$$` selects the links in the document. 
*) -let find_links html_content = - let pattern = Re.compile (Re.Perl.re "]* href=\"([^\"]*)") in - let links = Re.all pattern html_content - |> List.map (fun group -> Re.Group.get group 1) in - List.iter print_endline links +open Soup + +let find_links html_content = + let document_node = Soup.parse html_content in + document_node $$ "a[href]" |> iter (fun a -> print_endline (R.attribute "href" a)) (* Example usage *) @@ -51,11 +51,9 @@ let html_content = " " (*Expected output: - https://ocaml.org/docs https://pola.rs/ https://www.nonexistentwebsite.com - *) let () = find_links html_content From d7ad4d81b42ee36226d46ffac5a973aae2a3e25b Mon Sep 17 00:00:00 2001 From: Grant Smith <57376089+ggsmith842@users.noreply.github.com> Date: Thu, 4 Jul 2024 15:06:00 -0600 Subject: [PATCH 7/7] Update 00-lambdasoup.ml change `Soup.parse` to `parse` update documentation on selector query `$$` --- data/cookbook/extract-links-from-html/00-lambdasoup.ml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/cookbook/extract-links-from-html/00-lambdasoup.ml b/data/cookbook/extract-links-from-html/00-lambdasoup.ml index 94ecb45fe9..abd7232707 100644 --- a/data/cookbook/extract-links-from-html/00-lambdasoup.ml +++ b/data/cookbook/extract-links-from-html/00-lambdasoup.ml @@ -15,13 +15,13 @@ the content of the `href` tags. `parse` from the `Soup` library produces a document node representing the HTML string. -`$$` selects the links in the document. +`$$` selects nodes in the document using the selector query. *) open Soup let find_links html_content = - let document_node = Soup.parse html_content in + let document_node = parse html_content in document_node $$ "a[href]" |> iter (fun a -> print_endline (R.attribute "href" a))
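
A possible variant of the final lambdasoup recipe (not part of the patches above): a minimal sketch, assuming the same lambdasoup API used in 00-lambdasoup.ml (`parse`, `$$`, `R.attribute`, plus `to_list`), that returns the URLs as a `string list` instead of printing them, so callers can filter or otherwise post-process the links.

open Soup

(* Collect the `href` attribute of every `<a href=...>` element into a list,
   in document order. *)
let link_list html_content =
  parse html_content
  $$ "a[href]"
  |> to_list
  |> List.map (R.attribute "href")

(* Example usage: with the `html_content` string defined in the recipe,
   `link_list html_content` should contain the three URLs listed in the
   expected output above. *)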