From 8ea49e9403b141bd10fd8faa387749ea210b1d4a Mon Sep 17 00:00:00 2001 From: Grant Smith <57376089+ggsmith842@users.noreply.github.com> Date: Wed, 26 Jun 2024 15:14:42 -0600 Subject: [PATCH 1/7] Cookbook Extract Links from HTML --- .../cookbook/extract-links-from-html/00-Re.ml | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 data/cookbook/extract-links-from-html/00-Re.ml diff --git a/data/cookbook/extract-links-from-html/00-Re.ml b/data/cookbook/extract-links-from-html/00-Re.ml new file mode 100644 index 0000000000..75f2b4534c --- /dev/null +++ b/data/cookbook/extract-links-from-html/00-Re.ml @@ -0,0 +1,68 @@ +--- +packages: + - name: "re" + tested_version: "1.11.0" + used_libraries: + - re +--- + +(* + +Given an HTML document or string we can use the `re` library to create a regular expression that finds the href tags containing web links. +For example in the sample below we would expect to find three links for this HTML document. + +Sample HTML: + + + + + + Sample HTML Page + + +
+    <div>
+      <h1>My Cool Learning Links</h1>
+    </div>
+    <div>
+      <a href="https://ocaml.org/docs">OCaml Docs</a>
+      <a href="https://pola.rs/">Polars</a>
+      <a href="https://www.nonexistentwebsite.com">Nonexistent Website</a>
+    </div>
+    <div>
+      <p>Click a link to get started!</p>
+    </div>
+  </body>
+</html>
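+
+Expected links: https://ocaml.org/docs, https://pola.rs/, and https://www.nonexistentwebsite.com.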
+ + + +`find_links` accepts an argument `html_content` of type string that contains our HTML content. + +Using `Re.Perl.re` create a Perl flavored regular expression that searches for the `a href` tags. You can view the pattern using +to understand more about what is going on. [Regex101](https://regex101.com/r/2Bs442/1) + +`Re.all` searches the entire `html_content` string for the `pattern`. + +We then pipe the output to `List.map` since `Re.all` returns a list of all matches and apply the `Re.group.get` function to each group +in the list. Passing `1` we get the substring versus the entire matching group. This way we only get the url link and not the entire html tag. + +`List.iter` iterates through the `links` list and prints the urls. + +*) +let find_links html_content = + let pattern = Re.compile (Re.Perl.re "]* href=\"([^\"]*)") in + let links = Re.all pattern html_content + |> List.map (fun group -> Re.Group.get group 1) in + List.iter print_endline links + + +(* +Example usage: +First, define helper function `read_file` to read in html content at the path. Note: you may need to include `Open Stdlib` to access `In_channel` +*) +let read_file file = + In_channel.with_open_text file In_channel.input_all + + +(* Open the html file using `read_file` and find the links using the `find_links` function. *) +let () = find_links (read_file "lib/webhtml/index.html") From 3877f3f3b195c5e7a64e84a3d18e1282ba102e87 Mon Sep 17 00:00:00 2001 From: Christine Rose Date: Thu, 27 Jun 2024 02:15:33 -0700 Subject: [PATCH 2/7] formatting, grammar, verb agreement, etc. --- data/cookbook/extract-links-from-html/00-Re.ml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/data/cookbook/extract-links-from-html/00-Re.ml b/data/cookbook/extract-links-from-html/00-Re.ml index 75f2b4534c..72bfc0ef17 100644 --- a/data/cookbook/extract-links-from-html/00-Re.ml +++ b/data/cookbook/extract-links-from-html/00-Re.ml @@ -8,8 +8,8 @@ packages: (* -Given an HTML document or string we can use the `re` library to create a regular expression that finds the href tags containing web links. -For example in the sample below we would expect to find three links for this HTML document. +Given an HTML document or string, we can use the `re` library to create a regular expression that finds the `href` tags containing web links. +For example, in the sample below we would expect to find three links for this HTML document. Sample HTML: @@ -43,10 +43,10 @@ to understand more about what is going on. [Regex101](https://regex101.com/r/2Bs `Re.all` searches the entire `html_content` string for the `pattern`. -We then pipe the output to `List.map` since `Re.all` returns a list of all matches and apply the `Re.group.get` function to each group -in the list. Passing `1` we get the substring versus the entire matching group. This way we only get the url link and not the entire html tag. +We then pipe the output to `List.map`, since `Re.all` returns a list of all matches and applies the `Re.group.get` function to each group +in the list. Passing `1` we get the substring versus the entire matching group. This way we only get the URL link and not the entire HTML tag. -`List.iter` iterates through the `links` list and prints the urls. +`List.iter` iterates through the `links` list and prints the URLs. *) let find_links html_content = @@ -58,11 +58,11 @@ let find_links html_content = (* Example usage: -First, define helper function `read_file` to read in html content at the path. 
Note: you may need to include `Open Stdlib` to access `In_channel` +First, define helper function `read_file` to read in HTML content at the path. Note: you may need to include `Open Stdlib` to access `In_channel` *) let read_file file = In_channel.with_open_text file In_channel.input_all -(* Open the html file using `read_file` and find the links using the `find_links` function. *) +(* Open the HTML file using `read_file` and find the links using the `find_links` function. *) let () = find_links (read_file "lib/webhtml/index.html") From e56dee5d15a6642fa70a307ee008e3871b8ddad4 Mon Sep 17 00:00:00 2001 From: Grant Smith <57376089+ggsmith842@users.noreply.github.com> Date: Thu, 27 Jun 2024 10:16:56 -0600 Subject: [PATCH 3/7] Update data/cookbook/extract-links-from-html/00-Re.ml Co-authored-by: Christine Rose --- data/cookbook/extract-links-from-html/00-Re.ml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/cookbook/extract-links-from-html/00-Re.ml b/data/cookbook/extract-links-from-html/00-Re.ml index 72bfc0ef17..24396d27b9 100644 --- a/data/cookbook/extract-links-from-html/00-Re.ml +++ b/data/cookbook/extract-links-from-html/00-Re.ml @@ -38,8 +38,8 @@ Sample HTML: `find_links` accepts an argument `html_content` of type string that contains our HTML content. -Using `Re.Perl.re` create a Perl flavored regular expression that searches for the `a href` tags. You can view the pattern using -to understand more about what is going on. [Regex101](https://regex101.com/r/2Bs442/1) +Use `Re.Perl.re` to create a Perl flavored regular expression that searches for the `a href` tags. You can view the pattern using [Regex101](https://regex101.com/r/2Bs442/1) +to understand more about what is going on. `Re.all` searches the entire `html_content` string for the `pattern`. From 4b294627a5af41d555b754c29226e55b9929c330 Mon Sep 17 00:00:00 2001 From: Grant Smith <57376089+ggsmith842@users.noreply.github.com> Date: Thu, 27 Jun 2024 10:17:07 -0600 Subject: [PATCH 4/7] Update data/cookbook/extract-links-from-html/00-Re.ml Co-authored-by: Christine Rose --- data/cookbook/extract-links-from-html/00-Re.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/cookbook/extract-links-from-html/00-Re.ml b/data/cookbook/extract-links-from-html/00-Re.ml index 24396d27b9..a88b73d5ed 100644 --- a/data/cookbook/extract-links-from-html/00-Re.ml +++ b/data/cookbook/extract-links-from-html/00-Re.ml @@ -58,7 +58,7 @@ let find_links html_content = (* Example usage: -First, define helper function `read_file` to read in HTML content at the path. Note: you may need to include `Open Stdlib` to access `In_channel` +First, define the helper function's `read_file` in order to read it in HTML content at the path. Note: you may need to include `Open Stdlib` to access `In_channel`. *) let read_file file = In_channel.with_open_text file In_channel.input_all From da5c2eb90f1cb4121e23a00d034ff74119813194 Mon Sep 17 00:00:00 2001 From: Grant Smith <57376089+ggsmith842@users.noreply.github.com> Date: Thu, 27 Jun 2024 10:41:20 -0600 Subject: [PATCH 5/7] Update 00-Re.ml Simplified comments and removed `read_file` section. It is now replaced with an HTML string. I also removed the sample HTML that would render for simplicity. 
--- .../cookbook/extract-links-from-html/00-Re.ml | 65 +++++++++---------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/data/cookbook/extract-links-from-html/00-Re.ml b/data/cookbook/extract-links-from-html/00-Re.ml index a88b73d5ed..bb8444050e 100644 --- a/data/cookbook/extract-links-from-html/00-Re.ml +++ b/data/cookbook/extract-links-from-html/00-Re.ml @@ -8,15 +8,29 @@ packages: (* -Given an HTML document or string, we can use the `re` library to create a regular expression that finds the `href` tags containing web links. -For example, in the sample below we would expect to find three links for this HTML document. +`find_links` accepts an argument `html_content` of type string that contains our HTML content and returns +the content of the `href` tags. -Sample HTML: +You can view the pattern using [Regex101](https://regex101.com/r/2Bs442/1) +to understand more about what is going on. + +`Re.all` searches the entire `html_content` string for the `pattern`. Passing `1` to `Re.Group.get` returns the +substring versus the entire matching group. +*) +let find_links html_content = + let pattern = Re.compile (Re.Perl.re "]* href=\"([^\"]*)") in + let links = Re.all pattern html_content + |> List.map (fun group -> Re.Group.get group 1) in + List.iter print_endline links + + +(* Example usage *) +let html_content = " - + - + Sample HTML Page @@ -27,42 +41,21 @@ Sample HTML:
Click a link to get started!
- - -`find_links` accepts an argument `html_content` of type string that contains our HTML content. - -Use `Re.Perl.re` to create a Perl flavored regular expression that searches for the `a href` tags. You can view the pattern using [Regex101](https://regex101.com/r/2Bs442/1) -to understand more about what is going on. - -`Re.all` searches the entire `html_content` string for the `pattern`. - -We then pipe the output to `List.map`, since `Re.all` returns a list of all matches and applies the `Re.group.get` function to each group -in the list. Passing `1` we get the substring versus the entire matching group. This way we only get the URL link and not the entire HTML tag. - -`List.iter` iterates through the `links` list and prints the URLs. - -*) -let find_links html_content = - let pattern = Re.compile (Re.Perl.re "]* href=\"([^\"]*)") in - let links = Re.all pattern html_content - |> List.map (fun group -> Re.Group.get group 1) in - List.iter print_endline links +" + +(*Expected output: +https://ocaml.org/docs +https://pola.rs/ +https://www.nonexistentwebsite.com -(* -Example usage: -First, define the helper function's `read_file` in order to read it in HTML content at the path. Note: you may need to include `Open Stdlib` to access `In_channel`. *) -let read_file file = - In_channel.with_open_text file In_channel.input_all +let () = find_links html_content - -(* Open the HTML file using `read_file` and find the links using the `find_links` function. *) -let () = find_links (read_file "lib/webhtml/index.html") From e2db468a2fc8c8468dcb1c9d5675f8f20562aca7 Mon Sep 17 00:00:00 2001 From: Grant Smith <57376089+ggsmith842@users.noreply.github.com> Date: Thu, 4 Jul 2024 14:27:34 -0600 Subject: [PATCH 6/7] Update and rename 00-Re.ml to 00-lambdasoup.ml Changed from using regular expression to lambdasoup due to concerns raised about issues working with HTML using regular expressions. --- .../{00-Re.ml => 00-lambdasoup.ml} | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) rename data/cookbook/extract-links-from-html/{00-Re.ml => 00-lambdasoup.ml} (64%) diff --git a/data/cookbook/extract-links-from-html/00-Re.ml b/data/cookbook/extract-links-from-html/00-lambdasoup.ml similarity index 64% rename from data/cookbook/extract-links-from-html/00-Re.ml rename to data/cookbook/extract-links-from-html/00-lambdasoup.ml index bb8444050e..94ecb45fe9 100644 --- a/data/cookbook/extract-links-from-html/00-Re.ml +++ b/data/cookbook/extract-links-from-html/00-lambdasoup.ml @@ -1,9 +1,11 @@ --- packages: - - name: "re" - tested_version: "1.11.0" + - name: "lambdasoup" + tested_version: "1.0.0" used_libraries: - - re + - lambdasoup +discussion: | + - **Refernce:** The lambdasoup package provides a robust toolset for working with HTML. [github.com/lambdasoup](https://github.com/aantron/lambdasoup?tab=readme-ov-file) --- (* @@ -11,18 +13,16 @@ packages: `find_links` accepts an argument `html_content` of type string that contains our HTML content and returns the content of the `href` tags. -You can view the pattern using [Regex101](https://regex101.com/r/2Bs442/1) -to understand more about what is going on. +`parse` from the `Soup` library produces a document node representing the HTML string. -`Re.all` searches the entire `html_content` string for the `pattern`. Passing `1` to `Re.Group.get` returns the -substring versus the entire matching group. +`$$` selects the links in the document. 
*) -let find_links html_content = - let pattern = Re.compile (Re.Perl.re "]* href=\"([^\"]*)") in - let links = Re.all pattern html_content - |> List.map (fun group -> Re.Group.get group 1) in - List.iter print_endline links +open Soup + +let find_links html_content = + let document_node = Soup.parse html_content in + document_node $$ "a[href]" |> iter (fun a -> print_endline (R.attribute "href" a)) (* Example usage *) @@ -51,11 +51,9 @@ let html_content = " " (*Expected output: - https://ocaml.org/docs https://pola.rs/ https://www.nonexistentwebsite.com - *) let () = find_links html_content From d7ad4d81b42ee36226d46ffac5a973aae2a3e25b Mon Sep 17 00:00:00 2001 From: Grant Smith <57376089+ggsmith842@users.noreply.github.com> Date: Thu, 4 Jul 2024 15:06:00 -0600 Subject: [PATCH 7/7] Update 00-lambdasoup.ml change `Soup.parse` to `parse` update documentation on selector query `$$` --- data/cookbook/extract-links-from-html/00-lambdasoup.ml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/cookbook/extract-links-from-html/00-lambdasoup.ml b/data/cookbook/extract-links-from-html/00-lambdasoup.ml index 94ecb45fe9..abd7232707 100644 --- a/data/cookbook/extract-links-from-html/00-lambdasoup.ml +++ b/data/cookbook/extract-links-from-html/00-lambdasoup.ml @@ -15,13 +15,13 @@ the content of the `href` tags. `parse` from the `Soup` library produces a document node representing the HTML string. -`$$` selects the links in the document. +`$$` selects nodes in the document using the selector query. *) open Soup let find_links html_content = - let document_node = Soup.parse html_content in + let document_node = parse html_content in document_node $$ "a[href]" |> iter (fun a -> print_endline (R.attribute "href" a))
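
A possible variant of the final lambdasoup recipe (not part of the patches above): a minimal sketch, assuming the same lambdasoup API used in 00-lambdasoup.ml (`parse`, `$$`, `R.attribute`, plus `to_list`), that returns the URLs as a `string list` instead of printing them, so callers can filter or otherwise post-process the links.

open Soup

(* Collect the `href` attribute of every `<a href=...>` element into a list,
   in document order. *)
let link_list html_content =
  parse html_content
  $$ "a[href]"
  |> to_list
  |> List.map (R.attribute "href")

(* Example usage: with the `html_content` string defined in the recipe,
   `link_list html_content` should contain the three URLs listed in the
   expected output above. *)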