From 734b404c88646eb95d7652416fb0ff56d23b1069 Mon Sep 17 00:00:00 2001 From: Timothy Stone Date: Sun, 9 May 2021 12:37:58 -0400 Subject: [PATCH 01/11] Provide formatting for examples Simple markdown adjustment for examples using `code` --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a4203ae..37ded66 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,11 @@ Due to the lack of useful native DOM parsers this class implements the HTMLClean ## Usage ## In this example we will fetch the og:title and og:type contents, while ignoring any errors if this page does not comply with the Open Graph protocol standard (set in the constructor via true) -> OpenGraph testPage = new OpenGraph("http://uk.rottentomatoes.com/m/1217700-kick_ass", true); +> `OpenGraph testPage = new OpenGraph("http://uk.rottentomatoes.com/m/1217700-kick_ass", true);` -> String title = testPage.getContent("title"); +> `String title = testPage.getContent("title");` -> String type = testPage.getContent("type"); +> `String type = testPage.getContent("type");` Another example (available in the examples/ folder) demonstrates the support for custom OpenGraph namespaces From 96a88112bab987f087919037de07d2abfa0b1a7e Mon Sep 17 00:00:00 2001 From: Timothy Stone Date: Sun, 9 May 2021 12:43:19 -0400 Subject: [PATCH 02/11] Provide additional exclusions and organize git directory exclusions do not need `*` for depth, see [Pattern Format](http://git-scm.com/docs/gitignore#_pattern_format). Added exclusions for IntelliJ and Mac OS X. Some minor organization. 
--- .gitignore | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index e9aa7e9..3ba916c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,14 @@ #Ignore -bin/* -testreport/* -examples/backtothefuture/build/* -target/* +bin/ +testreport/ +examples/backtothefuture/build/ + +## Maven +target/ + +## IntelliJ +*.iml +.idea + +## Mac OS X +.DS_Store From 6f83b63a0a6b329378f9c51525d994260667a168 Mon Sep 17 00:00:00 2001 From: Timothy Stone Date: Sun, 9 May 2021 13:30:07 -0400 Subject: [PATCH 03/11] Add License and update POM Source encoding. Compiler props. Add `licenses`. Add LICENSE.txt. --- LICENSE.txt | 28 ++++++++++++++++++ pom.xml | 85 +++++++++++++++++++++++++++++------------------------ 2 files changed, 75 insertions(+), 38 deletions(-) create mode 100644 LICENSE.txt diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..17d470f --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,28 @@ +Copyright (c) 2010-2021 John Deverall. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/pom.xml b/pom.xml index 3281a04..d9fe1e8 100644 --- a/pom.xml +++ b/pom.xml @@ -1,41 +1,50 @@ - 4.0.0 - OpenGraph - OpenGraph - 0.0.1-SNAPSHOT - A Facebook OpenGraph implementation for Java - OpenGraph for Java + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + 4.0.0 + OpenGraph + OpenGraph + 0.0.2-SNAPSHOT + A Facebook OpenGraph implementation for Java + OpenGraph for Java - - - org.hamcrest - hamcrest-core - 1.3 - - - net.sourceforge.htmlcleaner - htmlcleaner - 2.16 - - - junit - junit - 4.12 - - - - - src - - - maven-compiler-plugin - 3.3 - - - - - - - + + 1.8 + 1.8 + UTF-8 + + + + org.hamcrest + hamcrest-core + 1.3 + + + net.sourceforge.htmlcleaner + htmlcleaner + 2.24 + + + junit + junit + 4.12 + + + + + BSD-3-Clause + https://opensource.org/licenses/BSD-3-Clause + Best guess license from repo comments. Compatible with the HTMLCleaner BSD-3-Clause license. + + manual + + + + src + + + maven-compiler-plugin + 3.8.1 + + + \ No newline at end of file From d9672da9d622e3ae5a59a7e79ce8a4943c7ea4d0 Mon Sep 17 00:00:00 2001 From: Timothy Stone Date: Sun, 9 May 2021 13:31:51 -0400 Subject: [PATCH 04/11] Fix UTF characters from test page. 
--- src/test/main/java/org/opengraph/OpenGraphTest.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/test/main/java/org/opengraph/OpenGraphTest.java b/src/test/main/java/org/opengraph/OpenGraphTest.java index e8d5a19..0b44f47 100644 --- a/src/test/main/java/org/opengraph/OpenGraphTest.java +++ b/src/test/main/java/org/opengraph/OpenGraphTest.java @@ -1,15 +1,16 @@ package org.opengraph; -import org.junit.Test; -import org.opengraph.OpenGraph; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + -import static org.junit.Assert.*; +import org.junit.Test; public class OpenGraphTest { @Test public void shouldHandleMissingContentType() throws java.lang.Exception { OpenGraph site = new OpenGraph("http://www.bbc.com/future/story/20140428-the-myth-of-tech-revolutions", true); - assertEquals("Why it’s time to ditch the word ‘revolution’ in tech", site.getContent("title")); + assertEquals("Why it’s time to ditch the word ‘revolution’ in tech", site.getContent("title")); assertEquals("624", site.getContent("image:width")); } From 43a465960bcb4131ff924a79e51d44c15c29a3ac Mon Sep 17 00:00:00 2001 From: Timothy Stone Date: Sun, 9 May 2021 13:38:50 -0400 Subject: [PATCH 05/11] Refactor test packages, remove defaults from POM Refactored the test packages to Maven defaults and removed the `sourceDirectory` element from the compiler plugin for a cleaner build. 
--- pom.xml | 1 - src/test/{main => }/java/org/opengraph/OpenGraphTest.java | 0 2 files changed, 1 deletion(-) rename src/test/{main => }/java/org/opengraph/OpenGraphTest.java (100%) diff --git a/pom.xml b/pom.xml index d9fe1e8..ca5503b 100644 --- a/pom.xml +++ b/pom.xml @@ -39,7 +39,6 @@ - src maven-compiler-plugin diff --git a/src/test/main/java/org/opengraph/OpenGraphTest.java b/src/test/java/org/opengraph/OpenGraphTest.java similarity index 100% rename from src/test/main/java/org/opengraph/OpenGraphTest.java rename to src/test/java/org/opengraph/OpenGraphTest.java From 2a28be2afbd417806bb566d15b57ba4f3b42604b Mon Sep 17 00:00:00 2001 From: Timothy Stone Date: Sun, 9 May 2021 13:44:23 -0400 Subject: [PATCH 06/11] Fix tests using BBC BBC reorganized URIs and moved to HTTPS --- src/test/java/org/opengraph/OpenGraphTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/opengraph/OpenGraphTest.java b/src/test/java/org/opengraph/OpenGraphTest.java index 0b44f47..fd45453 100644 --- a/src/test/java/org/opengraph/OpenGraphTest.java +++ b/src/test/java/org/opengraph/OpenGraphTest.java @@ -9,7 +9,7 @@ public class OpenGraphTest { @Test public void shouldHandleMissingContentType() throws java.lang.Exception { - OpenGraph site = new OpenGraph("http://www.bbc.com/future/story/20140428-the-myth-of-tech-revolutions", true); + OpenGraph site = new OpenGraph("https://www.bbc.com/future/article/20140428-the-myth-of-tech-revolutions", true); assertEquals("Why it’s time to ditch the word ‘revolution’ in tech", site.getContent("title")); assertEquals("624", site.getContent("image:width")); } From b3ce502886ae8b6b9e24a4ee2cd44e17ad065b60 Mon Sep 17 00:00:00 2001 From: Timothy Stone Date: Sun, 9 May 2021 14:07:16 -0400 Subject: [PATCH 07/11] Complete more POM meta-data Build out the POM meta-data. 
--- pom.xml | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/pom.xml b/pom.xml index ca5503b..5ef061a 100644 --- a/pom.xml +++ b/pom.xml @@ -38,9 +38,71 @@ manual + + + John Deverall + johndeverall@gmail.com + https://github.com/johndeverall + John Deverall + https://github.com/johndeverall + + developer + + Pacific/Auckland + + + + + Callum Jones + https://github.com/callumj + Callum Jones + https://github.com/callumj/ + + contributor + + America/Los_Angeles + + + Ruslan Khmelyuk + ruslan@khmelyuk.xyz + http://www.khmelyuk.com/ + Ruslan Khmelyuk + http://www.khmelyuk.com/ + + contributor + + America/Los_Angeles + + + Niall Kennedy + niall@niallkennedy.com + https://www.niallkennedy.com/blog/ + Niall Kennedy + https://www.niallkennedy.com/blog/ + + contributor + + America/Los_Angeles + + @niall + + + + Timothy Stone + javafueled@gmail.com + https://github.com/timothystone + Timothy Stone + https://www.anothercaffeinatedday.com/ + + contributor + + America/New_York + + + org.apache.maven.plugins maven-compiler-plugin 3.8.1 From 696740e8736709a4bdec348b6f39aeeab808ede4 Mon Sep 17 00:00:00 2001 From: Timothy Stone Date: Sun, 9 May 2021 17:07:36 -0400 Subject: [PATCH 08/11] Refactor project for multi-module maven build A full refactoring of the project to take advantage of Maven. Refactoring includes naming, multi-module build, and more. Updated the URL for the BackToTheFuture example to use an archived version of the RottenTomatoes website at archive.org at the time of the original commit . Changes at RottenTomatoes prevented this from working. Maven GAV was refactored to use case normalization. The build creates an executable JAR (with dependencies). Update the SNAPSHOT version. 
--- examples/backtothefuture/build.xml | 72 ---- .../opengraph/examples}/BackToTheFuture.java | 0 examples/pom.xml | 55 +++ plugin/pom.xml | 24 ++ .../main/java/org/opengraph/MetaElement.java | 65 +++ .../main/java/org/opengraph/OpenGraph.java | 394 ++++++++++++++++++ .../org/opengraph/OpenGraphNamespace.java | 36 ++ .../java/org/opengraph/OpenGraphTest.java | 0 pom.xml | 17 +- 9 files changed, 588 insertions(+), 75 deletions(-) delete mode 100644 examples/backtothefuture/build.xml rename examples/backtothefuture/src/{ => main/java/org/opengraph/examples}/BackToTheFuture.java (100%) create mode 100644 examples/pom.xml create mode 100644 plugin/pom.xml create mode 100644 plugin/src/main/java/org/opengraph/MetaElement.java create mode 100644 plugin/src/main/java/org/opengraph/OpenGraph.java create mode 100644 plugin/src/main/java/org/opengraph/OpenGraphNamespace.java rename {src => plugin/src}/test/java/org/opengraph/OpenGraphTest.java (100%) diff --git a/examples/backtothefuture/build.xml b/examples/backtothefuture/build.xml deleted file mode 100644 index e8d4a48..0000000 --- a/examples/backtothefuture/build.xml +++ /dev/null @@ -1,72 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/examples/backtothefuture/src/BackToTheFuture.java b/examples/backtothefuture/src/main/java/org/opengraph/examples/BackToTheFuture.java similarity index 100% rename from examples/backtothefuture/src/BackToTheFuture.java rename to examples/backtothefuture/src/main/java/org/opengraph/examples/BackToTheFuture.java diff --git a/examples/pom.xml b/examples/pom.xml new file mode 100644 index 0000000..53dc844 --- /dev/null +++ b/examples/pom.xml @@ -0,0 +1,55 @@ + + 4.0.0 + opengraph-examples + ${project.parent.version} + jar + + + opengraph + opengraph + 0.0.2-SNAPSHOT + + + + + opengraph + opengraph-plugin + 0.0.2-SNAPSHOT + + + + backtothefuture/src/main/java + + 
+ org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + org.apache.maven.plugins + maven-assembly-plugin + 3.3.0 + + + package + + single + + + + + org.opengraph.examples.BackToTheFuture + + + + jar-with-dependencies + + + + + + + + + \ No newline at end of file diff --git a/plugin/pom.xml b/plugin/pom.xml new file mode 100644 index 0000000..b917484 --- /dev/null +++ b/plugin/pom.xml @@ -0,0 +1,24 @@ + + 4.0.0 + opengraph-plugin + ${project.parent.version} + jar + + + opengraph + opengraph + 0.0.2-SNAPSHOT + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + + + \ No newline at end of file diff --git a/plugin/src/main/java/org/opengraph/MetaElement.java b/plugin/src/main/java/org/opengraph/MetaElement.java new file mode 100644 index 0000000..d7e8916 --- /dev/null +++ b/plugin/src/main/java/org/opengraph/MetaElement.java @@ -0,0 +1,65 @@ +package org.opengraph; + +import java.net.URL; + +/** + * Represents OpenGraph enabled meta data for a specific document + * + * @author Callum Jones + */ +public class MetaElement { + private OpenGraphNamespace namespace; //either "og" an NS specific + private String property; + private String content; + + /** + * Construct the representation of an element + * + * @param namespace The namespace the element belongs to + * @param property The property key + * @param content The content or value of this element + */ + public MetaElement(OpenGraphNamespace namespace, String property, String content) { + this.namespace = namespace; + this.property = property; + this.content = content; + } + + /** + * Fetch the content string of the element + */ + public String getContent() { + return content; + } + + /** + * Fetch the OpenGraph namespace + */ + public OpenGraphNamespace getNamespace() { + return namespace; + } + + /** + * Fetch the property of the element + */ + public String getProperty() { + return property; + } + + /** + * Fetch the OpenGraph data from the object + * + * @return If the content is a 
URL, then an attempted will be made to build OpenGraph data from the object + */ + public OpenGraph getExtendedData() { + //The Java language should know the best form of a URL + try { + URL url = new URL(getContent()); + + //success + return new OpenGraph(url.toString(), true); + } catch (Exception e) { + return null; //not a valid URL + } + } +} \ No newline at end of file diff --git a/plugin/src/main/java/org/opengraph/OpenGraph.java b/plugin/src/main/java/org/opengraph/OpenGraph.java new file mode 100644 index 0000000..3c02a0d --- /dev/null +++ b/plugin/src/main/java/org/opengraph/OpenGraph.java @@ -0,0 +1,394 @@ +package org.opengraph; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Hashtable; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; + +/** + * A Java object representation of an Open Graph enabled webpage. + * A simplified layer over a Hastable. 
+ * + * @author Callum Jones + */ +public class OpenGraph { + private String pageUrl; + private ArrayList pageNamespaces; + private Hashtable> metaAttributes; + private String baseType; + private boolean isImported; // determine if the object is a new incarnation or representation of a web page + private boolean hasChanged; // track if object has been changed + + public final static String[] REQUIRED_META = new String[] {"title", "type", "image", "url"}; + + public final static Hashtable BASE_TYPES = new Hashtable(); + + static { + BASE_TYPES.put("activity", new String[] {"activity", "sport"}); + BASE_TYPES.put("business", new String[] {"bar", "company", "cafe", "hotel", "restaurant"}); + BASE_TYPES.put("group", new String[] {"cause", "sports_league", "sports_team"}); + BASE_TYPES.put("organization", new String[] {"band", "government", "non_profit", "school", "university"}); + BASE_TYPES.put("person", + new String[] {"actor", "athlete", "author", "director", "musician", "politician", "profile", "public_figure"}); + BASE_TYPES.put("place", new String[] {"city", "country", "landmark", "state_province"}); + BASE_TYPES + .put("product", new String[] {"album", "book", "drink", "food", "game", "movie", "product", "song", "tv_show"}); + BASE_TYPES.put("website", new String[] {"blog", "website", "article"}); + } + + /** + * Create an open graph representation for generating your own Open Graph object + */ + public OpenGraph() { + pageNamespaces = new ArrayList(); + metaAttributes = new Hashtable>(); + hasChanged = false; + isImported = false; + } + + /** + * Fetch the open graph representation from a web site + * + * @param url The address to the web page to fetch Open Graph data + * @param ignoreSpecErrors Set this option to true if you don't wish to have an exception throw if the page does not conform to the basic 4 attributes + * @throws java.io.IOException If a network error occurs, the HTML parser will throw an IO Exception + * @throws java.lang.Exception A generic 
exception is throw if the specific page fails to conform to the basic Open Graph standard as define by the constant REQUIRED_META + */ + public OpenGraph(String url, boolean ignoreSpecErrors) throws java.io.IOException, Exception { + this(); + isImported = true; + + + // download the (X)HTML content, but only up to the closing head tag. We do not want to waste resources parsing irrelevant content + URL pageURL = new URL(url); + URLConnection siteConnection = pageURL.openConnection(); + Charset charset = getConnectionCharset(siteConnection); + BufferedReader dis = new BufferedReader(new InputStreamReader(siteConnection.getInputStream(), charset)); + String inputLine; + StringBuffer headContents = new StringBuffer(); + + // Loop through each line, looking for the closing head element + while ((inputLine = dis.readLine()) != null) { + if (inputLine.contains("")) { + inputLine = inputLine.substring(0, inputLine.indexOf("") + 7); + inputLine = inputLine.concat(""); + headContents.append(inputLine + "\r\n"); + break; + } + headContents.append(inputLine + "\r\n"); + } + + String headContentsStr = headContents.toString(); + HtmlCleaner cleaner = new HtmlCleaner(); + // parse the string HTML + TagNode pageData = cleaner.clean(headContentsStr); + + // read in the declared namespaces + boolean hasOGspec = false; + TagNode headElement = pageData.findElementByName("head", true); + if (headElement.hasAttribute("prefix")) { + String namespaceData = headElement.getAttributeByName("prefix"); + Pattern pattern = Pattern.compile("(([A-Za-z0-9_]+):\\s+(http:\\/\\/ogp.me\\/ns(\\/\\w+)*#))\\s*"); + Matcher matcher = pattern.matcher(namespaceData); + while (matcher.find()) { + String prefix = matcher.group(2); + String documentURI = matcher.group(3); + pageNamespaces.add(new OpenGraphNamespace(prefix, documentURI)); + if (prefix.equals("og")) { + hasOGspec = true; + } + } + } + + // some pages do not include the new OG spec + // this fixes compatibility + if (!hasOGspec) { + 
pageNamespaces.add(new OpenGraphNamespace("og", "http:// ogp.me/ns#")); + } + + // open only the meta tags + TagNode[] metaData = pageData.getElementsByName("meta", true); + for (TagNode metaElement : metaData) { + for (OpenGraphNamespace namespace : pageNamespaces) { + String target = null; + if (metaElement.hasAttribute("property")) { + target = "property"; + } else if (metaElement.hasAttribute("name")) { + target = "name"; + } + + if (target != null && metaElement.getAttributeByName(target).startsWith(namespace.getPrefix() + ":")) { + setProperty(namespace, metaElement.getAttributeByName(target), metaElement.getAttributeByName("content")); + break; + } + } + } + + /** + * Check that page conforms to Open Graph protocol + */ + if (!ignoreSpecErrors) { + for (String req : REQUIRED_META) { + if (!metaAttributes.containsKey(req)) { + throw new Exception("Does not conform to Open Graph protocol"); + } + } + } + + /** + * Has conformed, now determine basic sub type. + */ + baseType = null; + String currentType = getContent("type"); + // some apps use their OG namespace as a prefix + if (currentType != null) { + for (OpenGraphNamespace ns : pageNamespaces) { + if (currentType.startsWith(ns.getPrefix() + ":")) { + currentType = currentType.replaceFirst(ns.getPrefix() + ":", ""); + break; // done here + } + } + } + for (String base : BASE_TYPES.keySet()) { + String[] baseList = BASE_TYPES.get(base); + boolean finished = false; + for (String expandedType : baseList) { + if (expandedType.equals(currentType)) { + baseType = base; + finished = true; + break; + } + } + if (finished) { + break; + } + } + + // read the original page url + URL realURL = siteConnection.getURL(); + pageUrl = realURL.toExternalForm(); + } + + /** + * Gets the charset for specified connection. + * Content Type header is parsed to get the charset name. + * + * @param connection the connection. + * @return the Charset object for response charset name; + * if it's not found then the default charset. 
+ */ + private static Charset getConnectionCharset(URLConnection connection) { + String contentType = connection.getContentType(); + if (contentType != null && contentType.length() > 0) { + contentType = contentType.toLowerCase(); + String charsetName = extractCharsetName(contentType); + if (charsetName != null && charsetName.length() > 0) { + try { + return Charset.forName(charsetName); + } catch (Exception e) { + // specified charset is not found, + // skip it to return the default one + } + } + } + + // return the default charset + return Charset.defaultCharset(); + } + + /** + * Extract the charset name form the content type string. + * Content type string is received from Content-Type header. + * + * @param contentType the content type string, must be not null. + * @return the found charset name or null if not found. + */ + private static String extractCharsetName(String contentType) { + // split onto media types + final String[] mediaTypes = contentType.split(":"); + if (mediaTypes.length > 0) { + // use only the first one, and split it on parameters + final String[] params = mediaTypes[0].split(";"); + + // find the charset parameter and return it's value + for (String each : params) { + each = each.trim(); + if (each.startsWith("charset=")) { + // return the charset name + return each.substring(8).trim(); + } + } + } + + return null; + } + + /** + * Get the basic type of the Open graph page as per the specification + * + * @return Base type as defined by specification, null otherwise + */ + public String getBaseType() { + return baseType; + } + + /** + * Get a value of a given Open Graph property + * + * @param property The Open graph property key + * @return Returns the value of the first property defined, null otherwise + */ + public String getContent(String property) { + if (metaAttributes.containsKey(property) && metaAttributes.get(property).size() > 0) { + return metaAttributes.get(property).get(0).getContent(); + } else { + return null; + } + } + + 
/** + * Get all the defined properties of the Open Graph object + * + * @return An array of all currently defined properties + */ + public MetaElement[] getProperties() { + ArrayList allElements = new ArrayList(); + for (ArrayList collection : metaAttributes.values()) { + allElements.addAll(collection); + } + + return (MetaElement[]) allElements.toArray(new MetaElement[allElements.size()]); + } + + /** + * Get all the defined properties of the Open Graph object + * + * @param property The property to focus on + * @return An array of all currently defined properties + */ + public MetaElement[] getProperties(String property) { + if (metaAttributes.containsKey(property)) { + ArrayList target = metaAttributes.get(property); + return (MetaElement[]) target.toArray(new MetaElement[target.size()]); + } else { + return null; + } + } + + /** + * Get the original URL the Open Graph page was obtained from + * + * @return The address to the Open Graph object page + */ + public String getOriginalUrl() { + return pageUrl; + } + + + /** + * Get the HTML representation of the Open Graph data. + * + * @return An array of meta elements as Strings + */ + public String[] toHTML() { + // allocate the array + ArrayList returnHTML = new ArrayList(); + + int index = 0; // keep track of the index to insert into + for (ArrayList elements : metaAttributes.values()) { + for (MetaElement element : elements) { + returnHTML.add(""); + } + } + + // return the array + return (String[]) returnHTML.toArray(); + } + + /** + * Get the XHTML representation of the Open Graph data. 
+ * + * @return An array of meta elements as Strings + */ + public String[] toXHTML() { + // allocate the array + ArrayList returnHTML = new ArrayList(); + + int index = 0; // keep track of the index to insert into + for (ArrayList elements : metaAttributes.values()) { + for (MetaElement element : elements) { + returnHTML.add(""); + } + } + + // return the array + return (String[]) returnHTML.toArray(); + } + + /** + * Set the Open Graph property to a specific value + * + * @param namespace The OpenGraph namespace the content belongs to + * @param property The og:XXXX where XXXX is the property you wish to set + * @param content The value or contents of the property to be set + */ + public void setProperty(OpenGraphNamespace namespace, String property, String content) { + if (!pageNamespaces.contains(namespace)) { + pageNamespaces.add(namespace); + } + + property = property.replaceAll(namespace.getPrefix() + ":", ""); + MetaElement element = new MetaElement(namespace, property, content); + if (!metaAttributes.containsKey(property)) { + metaAttributes.put(property, new ArrayList()); + } + + metaAttributes.get(property).add(element); + } + + /** + * Removed a defined property + * + * @param property The og:XXXX where XXXX is the property you wish to remove + */ + public void removeProperty(String property) { + metaAttributes.remove(property); + } + + /** + * Obtain the underlying HashTable + * + * @return The underlying structure as a Hashtable + */ + public Hashtable> exposeTable() { + return metaAttributes; + } + + /** + * Test if the Open Graph object was initially a representation of a web page + * + * @return True if the object is from a web page, false otherwise + */ + public boolean isFromWeb() { + return isImported; + } + + /** + * Test if the object has been modified by setters/deleters. 
+ * This is only relevant if this object initially represented a web page + * + * @return True True if the object has been modified, false otherwise + */ + public boolean hasChanged() { + return hasChanged; + } +} diff --git a/plugin/src/main/java/org/opengraph/OpenGraphNamespace.java b/plugin/src/main/java/org/opengraph/OpenGraphNamespace.java new file mode 100644 index 0000000..64643f4 --- /dev/null +++ b/plugin/src/main/java/org/opengraph/OpenGraphNamespace.java @@ -0,0 +1,36 @@ +package org.opengraph; + +/** + * Represents an OpenGraph namespace + * + * @author Callum Jones + */ +public class OpenGraphNamespace { + private String prefix; + private String schemaURI; + + /** + * Construct a namespace + * + * @param prefix The OpenGraph assigned namespace prefix such as og or og_appname + * @param schemaURI The URL for the OpenGraph schema + */ + public OpenGraphNamespace(String prefix, String schemaURI) { + this.prefix = prefix; + this.schemaURI = schemaURI; + } + + /* + * Fetch the prefix used for the namespace + */ + public String getPrefix() { + return prefix; + } + + /* + * Fetch the address for the schema reference + */ + public String getSchemaURI() { + return schemaURI; + } +} \ No newline at end of file diff --git a/src/test/java/org/opengraph/OpenGraphTest.java b/plugin/src/test/java/org/opengraph/OpenGraphTest.java similarity index 100% rename from src/test/java/org/opengraph/OpenGraphTest.java rename to plugin/src/test/java/org/opengraph/OpenGraphTest.java diff --git a/pom.xml b/pom.xml index 5ef061a..d1c6d97 100644 --- a/pom.xml +++ b/pom.xml @@ -1,9 +1,10 @@ 4.0.0 - OpenGraph - OpenGraph + opengraph + opengraph 0.0.2-SNAPSHOT + pom A Facebook OpenGraph implementation for Java OpenGraph for Java @@ -12,6 +13,11 @@ 1.8 UTF-8 + + + plugin + examples + org.hamcrest @@ -26,9 +32,10 @@ junit junit - 4.12 + 4.13.1 + BSD-3-Clause @@ -38,6 +45,7 @@ manual + John Deverall @@ -51,6 +59,7 @@ Pacific/Auckland + Callum Jones @@ -99,6 +108,7 @@ America/New_York + 
@@ -108,4 +118,5 @@ + \ No newline at end of file From 88d988ee814d0d47c3831b52770921642c309104 Mon Sep 17 00:00:00 2001 From: Timothy Stone Date: Sun, 9 May 2021 17:09:34 -0400 Subject: [PATCH 09/11] Fix the RottenTomatoes link Use the archive.org version for the movie at the time that the class was first committed to the project. --- .../org/opengraph/examples/BackToTheFuture.java | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/backtothefuture/src/main/java/org/opengraph/examples/BackToTheFuture.java b/examples/backtothefuture/src/main/java/org/opengraph/examples/BackToTheFuture.java index 6bd13e7..c822b26 100644 --- a/examples/backtothefuture/src/main/java/org/opengraph/examples/BackToTheFuture.java +++ b/examples/backtothefuture/src/main/java/org/opengraph/examples/BackToTheFuture.java @@ -1,19 +1,16 @@ package org.opengraph.examples; -import org.opengraph.OpenGraph; import org.opengraph.MetaElement; +import org.opengraph.OpenGraph; public class BackToTheFuture { - static String uri = "http://www.rottentomatoes.com/m/back_to_the_future/"; - - public static void main(String [] args) - { - try - { + static String uri = "https://web.archive.org/web/20110924151516/https://www.rottentomatoes.com/m/back_to_the_future"; + + public static void main(String[] args) { + try { OpenGraph movie = new OpenGraph(uri, true); System.out.println("Movie: " + movie.getContent("title")); - for (MetaElement director : movie.getProperties("director")) - { + for (MetaElement director : movie.getProperties("director")) { OpenGraph extendedInfo = director.getExtendedData(); System.out.println("Directed by: " + extendedInfo.getContent("title")); } From a5737958c75b14320264b0b0c095c91f36f09436 Mon Sep 17 00:00:00 2001 From: Timothy Stone Date: Sun, 9 May 2021 12:37:58 -0400 Subject: [PATCH 10/11] Fully refactor the project for Maven As shown below, numerous refactorings took place. 
The refactoring builds a multi-module maven project and sets up the project for leveraging additional features of maven. Provide formatting for examples Simple markdown adjustment for examples using `code` Provide additional exclusions and organize git directory exclusions do not need `*` for depth, see [Pattern Format](http://git-scm.com/docs/gitignore#_pattern_format). Added exclusions for IntelliJ and Mac OS X. Some minor organization. Add License and update POM Source encoding. Compiler props. Add `licenses`. Add LICENSE.txt. Fix UTF characters from test page. Refactor test packages, remove defaults from POM Refactored the test packages to Maven defaults and removed the `sourceDirectory` element from the compiler plugin for a cleaner build. Fix tests using BBC BBC reorganized URIs and moved to HTTPS Complete more POM meta-data Build out the POM meta-data. Refactor project for multi-module maven build A full refactoring of the project to take advantage of Maven. Refactoring includes naming, multi-module build, and more. Updated the URL for the BackToTheFuture example to use an archived version of the RottenTomatoes website at archive.org at the time of the original commit. Changes at RottenTomatoes prevented this from working. Maven GAV was refactored to use case normalization. The build creates an executable JAR (with dependencies). Update the SNAPSHOT version. Fix the RottenTomatoes link Use the archive.org version for the movie at the time that the class was first committed to the project.
--- .gitignore | 17 +- LICENSE.txt | 28 ++ README.md | 6 +- examples/backtothefuture/build.xml | 72 ---- .../opengraph/examples}/BackToTheFuture.java | 15 +- examples/pom.xml | 55 +++ plugin/pom.xml | 24 ++ .../main/java/org/opengraph/MetaElement.java | 65 +++ .../main/java/org/opengraph/OpenGraph.java | 394 ++++++++++++++++++ .../org/opengraph/OpenGraphNamespace.java | 36 ++ .../java/org/opengraph/OpenGraphTest.java | 11 +- pom.xml | 155 +++++-- 12 files changed, 748 insertions(+), 130 deletions(-) create mode 100644 LICENSE.txt delete mode 100644 examples/backtothefuture/build.xml rename examples/backtothefuture/src/{ => main/java/org/opengraph/examples}/BackToTheFuture.java (73%) create mode 100644 examples/pom.xml create mode 100644 plugin/pom.xml create mode 100644 plugin/src/main/java/org/opengraph/MetaElement.java create mode 100644 plugin/src/main/java/org/opengraph/OpenGraph.java create mode 100644 plugin/src/main/java/org/opengraph/OpenGraphNamespace.java rename {src/test/main => plugin/src/test}/java/org/opengraph/OpenGraphTest.java (57%) diff --git a/.gitignore b/.gitignore index e9aa7e9..3ba916c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,14 @@ #Ignore -bin/* -testreport/* -examples/backtothefuture/build/* -target/* +bin/ +testreport/ +examples/backtothefuture/build/ + +## Maven +target/ + +## IntelliJ +*.iml +.idea + +## Mac OS X +.DS_Store diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..17d470f --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,28 @@ +Copyright (c) 2010-2021 John Deverall. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/README.md b/README.md index a4203ae..37ded66 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,11 @@ Due to the lack of useful native DOM parsers this class implements the HTMLClean ## Usage ## In this example we will fetch the og:title and og:type contents, while ignoring any errors if this page does not comply with the Open Graph protocol standard (set in the constructor via true) -> OpenGraph testPage = new OpenGraph("http://uk.rottentomatoes.com/m/1217700-kick_ass", true); +> `OpenGraph testPage = new OpenGraph("http://uk.rottentomatoes.com/m/1217700-kick_ass", true);` -> String title = testPage.getContent("title"); +> `String title = testPage.getContent("title");` -> String type = testPage.getContent("type"); +> `String type = testPage.getContent("type");` Another example (available in the examples/ folder) demonstrates the support for custom OpenGraph namespaces diff --git a/examples/backtothefuture/build.xml b/examples/backtothefuture/build.xml deleted file mode 100644 index e8d4a48..0000000 --- a/examples/backtothefuture/build.xml +++ /dev/null @@ -1,72 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/examples/backtothefuture/src/BackToTheFuture.java b/examples/backtothefuture/src/main/java/org/opengraph/examples/BackToTheFuture.java similarity index 73% rename from examples/backtothefuture/src/BackToTheFuture.java rename to examples/backtothefuture/src/main/java/org/opengraph/examples/BackToTheFuture.java index 6bd13e7..c822b26 100644 --- a/examples/backtothefuture/src/BackToTheFuture.java +++ b/examples/backtothefuture/src/main/java/org/opengraph/examples/BackToTheFuture.java @@ -1,19 +1,16 @@ package org.opengraph.examples; -import org.opengraph.OpenGraph; import org.opengraph.MetaElement; +import org.opengraph.OpenGraph; public class BackToTheFuture { - static String uri = 
"http://www.rottentomatoes.com/m/back_to_the_future/"; - - public static void main(String [] args) - { - try - { + static String uri = "https://web.archive.org/web/20110924151516/https://www.rottentomatoes.com/m/back_to_the_future"; + + public static void main(String[] args) { + try { OpenGraph movie = new OpenGraph(uri, true); System.out.println("Movie: " + movie.getContent("title")); - for (MetaElement director : movie.getProperties("director")) - { + for (MetaElement director : movie.getProperties("director")) { OpenGraph extendedInfo = director.getExtendedData(); System.out.println("Directed by: " + extendedInfo.getContent("title")); } diff --git a/examples/pom.xml b/examples/pom.xml new file mode 100644 index 0000000..53dc844 --- /dev/null +++ b/examples/pom.xml @@ -0,0 +1,55 @@ + + 4.0.0 + opengraph-examples + ${project.parent.version} + jar + + + opengraph + opengraph + 0.0.2-SNAPSHOT + + + + + opengraph + opengraph-plugin + 0.0.2-SNAPSHOT + + + + backtothefuture/src/main/java + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + org.apache.maven.plugins + maven-assembly-plugin + 3.3.0 + + + package + + single + + + + + org.opengraph.examples.BackToTheFuture + + + + jar-with-dependencies + + + + + + + + + \ No newline at end of file diff --git a/plugin/pom.xml b/plugin/pom.xml new file mode 100644 index 0000000..b917484 --- /dev/null +++ b/plugin/pom.xml @@ -0,0 +1,24 @@ + + 4.0.0 + opengraph-plugin + ${project.parent.version} + jar + + + opengraph + opengraph + 0.0.2-SNAPSHOT + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + + + \ No newline at end of file diff --git a/plugin/src/main/java/org/opengraph/MetaElement.java b/plugin/src/main/java/org/opengraph/MetaElement.java new file mode 100644 index 0000000..d7e8916 --- /dev/null +++ b/plugin/src/main/java/org/opengraph/MetaElement.java @@ -0,0 +1,65 @@ +package org.opengraph; + +import java.net.URL; + +/** + * Represents OpenGraph enabled meta data for a specific 
document + * + * @author Callum Jones + */ +public class MetaElement { + private OpenGraphNamespace namespace; //either "og" an NS specific + private String property; + private String content; + + /** + * Construct the representation of an element + * + * @param namespace The namespace the element belongs to + * @param property The property key + * @param content The content or value of this element + */ + public MetaElement(OpenGraphNamespace namespace, String property, String content) { + this.namespace = namespace; + this.property = property; + this.content = content; + } + + /** + * Fetch the content string of the element + */ + public String getContent() { + return content; + } + + /** + * Fetch the OpenGraph namespace + */ + public OpenGraphNamespace getNamespace() { + return namespace; + } + + /** + * Fetch the property of the element + */ + public String getProperty() { + return property; + } + + /** + * Fetch the OpenGraph data from the object + * + * @return If the content is a URL, then an attempted will be made to build OpenGraph data from the object + */ + public OpenGraph getExtendedData() { + //The Java language should know the best form of a URL + try { + URL url = new URL(getContent()); + + //success + return new OpenGraph(url.toString(), true); + } catch (Exception e) { + return null; //not a valid URL + } + } +} \ No newline at end of file diff --git a/plugin/src/main/java/org/opengraph/OpenGraph.java b/plugin/src/main/java/org/opengraph/OpenGraph.java new file mode 100644 index 0000000..3c02a0d --- /dev/null +++ b/plugin/src/main/java/org/opengraph/OpenGraph.java @@ -0,0 +1,394 @@ +package org.opengraph; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Hashtable; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.htmlcleaner.HtmlCleaner; +import 
org.htmlcleaner.TagNode; + +/** + * A Java object representation of an Open Graph enabled webpage. + * A simplified layer over a Hastable. + * + * @author Callum Jones + */ +public class OpenGraph { + private String pageUrl; + private ArrayList pageNamespaces; + private Hashtable> metaAttributes; + private String baseType; + private boolean isImported; // determine if the object is a new incarnation or representation of a web page + private boolean hasChanged; // track if object has been changed + + public final static String[] REQUIRED_META = new String[] {"title", "type", "image", "url"}; + + public final static Hashtable BASE_TYPES = new Hashtable(); + + static { + BASE_TYPES.put("activity", new String[] {"activity", "sport"}); + BASE_TYPES.put("business", new String[] {"bar", "company", "cafe", "hotel", "restaurant"}); + BASE_TYPES.put("group", new String[] {"cause", "sports_league", "sports_team"}); + BASE_TYPES.put("organization", new String[] {"band", "government", "non_profit", "school", "university"}); + BASE_TYPES.put("person", + new String[] {"actor", "athlete", "author", "director", "musician", "politician", "profile", "public_figure"}); + BASE_TYPES.put("place", new String[] {"city", "country", "landmark", "state_province"}); + BASE_TYPES + .put("product", new String[] {"album", "book", "drink", "food", "game", "movie", "product", "song", "tv_show"}); + BASE_TYPES.put("website", new String[] {"blog", "website", "article"}); + } + + /** + * Create an open graph representation for generating your own Open Graph object + */ + public OpenGraph() { + pageNamespaces = new ArrayList(); + metaAttributes = new Hashtable>(); + hasChanged = false; + isImported = false; + } + + /** + * Fetch the open graph representation from a web site + * + * @param url The address to the web page to fetch Open Graph data + * @param ignoreSpecErrors Set this option to true if you don't wish to have an exception throw if the page does not conform to the basic 4 attributes + * 
@throws java.io.IOException If a network error occurs, the HTML parser will throw an IO Exception + * @throws java.lang.Exception A generic exception is throw if the specific page fails to conform to the basic Open Graph standard as define by the constant REQUIRED_META + */ + public OpenGraph(String url, boolean ignoreSpecErrors) throws java.io.IOException, Exception { + this(); + isImported = true; + + + // download the (X)HTML content, but only up to the closing head tag. We do not want to waste resources parsing irrelevant content + URL pageURL = new URL(url); + URLConnection siteConnection = pageURL.openConnection(); + Charset charset = getConnectionCharset(siteConnection); + BufferedReader dis = new BufferedReader(new InputStreamReader(siteConnection.getInputStream(), charset)); + String inputLine; + StringBuffer headContents = new StringBuffer(); + + // Loop through each line, looking for the closing head element + while ((inputLine = dis.readLine()) != null) { + if (inputLine.contains("")) { + inputLine = inputLine.substring(0, inputLine.indexOf("") + 7); + inputLine = inputLine.concat(""); + headContents.append(inputLine + "\r\n"); + break; + } + headContents.append(inputLine + "\r\n"); + } + + String headContentsStr = headContents.toString(); + HtmlCleaner cleaner = new HtmlCleaner(); + // parse the string HTML + TagNode pageData = cleaner.clean(headContentsStr); + + // read in the declared namespaces + boolean hasOGspec = false; + TagNode headElement = pageData.findElementByName("head", true); + if (headElement.hasAttribute("prefix")) { + String namespaceData = headElement.getAttributeByName("prefix"); + Pattern pattern = Pattern.compile("(([A-Za-z0-9_]+):\\s+(http:\\/\\/ogp.me\\/ns(\\/\\w+)*#))\\s*"); + Matcher matcher = pattern.matcher(namespaceData); + while (matcher.find()) { + String prefix = matcher.group(2); + String documentURI = matcher.group(3); + pageNamespaces.add(new OpenGraphNamespace(prefix, documentURI)); + if (prefix.equals("og")) { + 
hasOGspec = true; + } + } + } + + // some pages do not include the new OG spec + // this fixes compatibility + if (!hasOGspec) { + pageNamespaces.add(new OpenGraphNamespace("og", "http:// ogp.me/ns#")); + } + + // open only the meta tags + TagNode[] metaData = pageData.getElementsByName("meta", true); + for (TagNode metaElement : metaData) { + for (OpenGraphNamespace namespace : pageNamespaces) { + String target = null; + if (metaElement.hasAttribute("property")) { + target = "property"; + } else if (metaElement.hasAttribute("name")) { + target = "name"; + } + + if (target != null && metaElement.getAttributeByName(target).startsWith(namespace.getPrefix() + ":")) { + setProperty(namespace, metaElement.getAttributeByName(target), metaElement.getAttributeByName("content")); + break; + } + } + } + + /** + * Check that page conforms to Open Graph protocol + */ + if (!ignoreSpecErrors) { + for (String req : REQUIRED_META) { + if (!metaAttributes.containsKey(req)) { + throw new Exception("Does not conform to Open Graph protocol"); + } + } + } + + /** + * Has conformed, now determine basic sub type. + */ + baseType = null; + String currentType = getContent("type"); + // some apps use their OG namespace as a prefix + if (currentType != null) { + for (OpenGraphNamespace ns : pageNamespaces) { + if (currentType.startsWith(ns.getPrefix() + ":")) { + currentType = currentType.replaceFirst(ns.getPrefix() + ":", ""); + break; // done here + } + } + } + for (String base : BASE_TYPES.keySet()) { + String[] baseList = BASE_TYPES.get(base); + boolean finished = false; + for (String expandedType : baseList) { + if (expandedType.equals(currentType)) { + baseType = base; + finished = true; + break; + } + } + if (finished) { + break; + } + } + + // read the original page url + URL realURL = siteConnection.getURL(); + pageUrl = realURL.toExternalForm(); + } + + /** + * Gets the charset for specified connection. + * Content Type header is parsed to get the charset name. 
+ * + * @param connection the connection. + * @return the Charset object for response charset name; + * if it's not found then the default charset. + */ + private static Charset getConnectionCharset(URLConnection connection) { + String contentType = connection.getContentType(); + if (contentType != null && contentType.length() > 0) { + contentType = contentType.toLowerCase(); + String charsetName = extractCharsetName(contentType); + if (charsetName != null && charsetName.length() > 0) { + try { + return Charset.forName(charsetName); + } catch (Exception e) { + // specified charset is not found, + // skip it to return the default one + } + } + } + + // return the default charset + return Charset.defaultCharset(); + } + + /** + * Extract the charset name form the content type string. + * Content type string is received from Content-Type header. + * + * @param contentType the content type string, must be not null. + * @return the found charset name or null if not found. + */ + private static String extractCharsetName(String contentType) { + // split onto media types + final String[] mediaTypes = contentType.split(":"); + if (mediaTypes.length > 0) { + // use only the first one, and split it on parameters + final String[] params = mediaTypes[0].split(";"); + + // find the charset parameter and return it's value + for (String each : params) { + each = each.trim(); + if (each.startsWith("charset=")) { + // return the charset name + return each.substring(8).trim(); + } + } + } + + return null; + } + + /** + * Get the basic type of the Open graph page as per the specification + * + * @return Base type as defined by specification, null otherwise + */ + public String getBaseType() { + return baseType; + } + + /** + * Get a value of a given Open Graph property + * + * @param property The Open graph property key + * @return Returns the value of the first property defined, null otherwise + */ + public String getContent(String property) { + if 
(metaAttributes.containsKey(property) && metaAttributes.get(property).size() > 0) { + return metaAttributes.get(property).get(0).getContent(); + } else { + return null; + } + } + + /** + * Get all the defined properties of the Open Graph object + * + * @return An array of all currently defined properties + */ + public MetaElement[] getProperties() { + ArrayList allElements = new ArrayList(); + for (ArrayList collection : metaAttributes.values()) { + allElements.addAll(collection); + } + + return (MetaElement[]) allElements.toArray(new MetaElement[allElements.size()]); + } + + /** + * Get all the defined properties of the Open Graph object + * + * @param property The property to focus on + * @return An array of all currently defined properties + */ + public MetaElement[] getProperties(String property) { + if (metaAttributes.containsKey(property)) { + ArrayList target = metaAttributes.get(property); + return (MetaElement[]) target.toArray(new MetaElement[target.size()]); + } else { + return null; + } + } + + /** + * Get the original URL the Open Graph page was obtained from + * + * @return The address to the Open Graph object page + */ + public String getOriginalUrl() { + return pageUrl; + } + + + /** + * Get the HTML representation of the Open Graph data. + * + * @return An array of meta elements as Strings + */ + public String[] toHTML() { + // allocate the array + ArrayList returnHTML = new ArrayList(); + + int index = 0; // keep track of the index to insert into + for (ArrayList elements : metaAttributes.values()) { + for (MetaElement element : elements) { + returnHTML.add(""); + } + } + + // return the array + return (String[]) returnHTML.toArray(); + } + + /** + * Get the XHTML representation of the Open Graph data. 
+ * + * @return An array of meta elements as Strings + */ + public String[] toXHTML() { + // allocate the array + ArrayList returnHTML = new ArrayList(); + + int index = 0; // keep track of the index to insert into + for (ArrayList elements : metaAttributes.values()) { + for (MetaElement element : elements) { + returnHTML.add(""); + } + } + + // return the array + return (String[]) returnHTML.toArray(); + } + + /** + * Set the Open Graph property to a specific value + * + * @param namespace The OpenGraph namespace the content belongs to + * @param property The og:XXXX where XXXX is the property you wish to set + * @param content The value or contents of the property to be set + */ + public void setProperty(OpenGraphNamespace namespace, String property, String content) { + if (!pageNamespaces.contains(namespace)) { + pageNamespaces.add(namespace); + } + + property = property.replaceAll(namespace.getPrefix() + ":", ""); + MetaElement element = new MetaElement(namespace, property, content); + if (!metaAttributes.containsKey(property)) { + metaAttributes.put(property, new ArrayList()); + } + + metaAttributes.get(property).add(element); + } + + /** + * Removed a defined property + * + * @param property The og:XXXX where XXXX is the property you wish to remove + */ + public void removeProperty(String property) { + metaAttributes.remove(property); + } + + /** + * Obtain the underlying HashTable + * + * @return The underlying structure as a Hashtable + */ + public Hashtable> exposeTable() { + return metaAttributes; + } + + /** + * Test if the Open Graph object was initially a representation of a web page + * + * @return True if the object is from a web page, false otherwise + */ + public boolean isFromWeb() { + return isImported; + } + + /** + * Test if the object has been modified by setters/deleters. 
+ * This is only relevant if this object initially represented a web page + * + * @return True True if the object has been modified, false otherwise + */ + public boolean hasChanged() { + return hasChanged; + } +} diff --git a/plugin/src/main/java/org/opengraph/OpenGraphNamespace.java b/plugin/src/main/java/org/opengraph/OpenGraphNamespace.java new file mode 100644 index 0000000..64643f4 --- /dev/null +++ b/plugin/src/main/java/org/opengraph/OpenGraphNamespace.java @@ -0,0 +1,36 @@ +package org.opengraph; + +/** + * Represents an OpenGraph namespace + * + * @author Callum Jones + */ +public class OpenGraphNamespace { + private String prefix; + private String schemaURI; + + /** + * Construct a namespace + * + * @param prefix The OpenGraph assigned namespace prefix such as og or og_appname + * @param schemaURI The URL for the OpenGraph schema + */ + public OpenGraphNamespace(String prefix, String schemaURI) { + this.prefix = prefix; + this.schemaURI = schemaURI; + } + + /* + * Fetch the prefix used for the namespace + */ + public String getPrefix() { + return prefix; + } + + /* + * Fetch the address for the schema reference + */ + public String getSchemaURI() { + return schemaURI; + } +} \ No newline at end of file diff --git a/src/test/main/java/org/opengraph/OpenGraphTest.java b/plugin/src/test/java/org/opengraph/OpenGraphTest.java similarity index 57% rename from src/test/main/java/org/opengraph/OpenGraphTest.java rename to plugin/src/test/java/org/opengraph/OpenGraphTest.java index e8d5a19..fd45453 100644 --- a/src/test/main/java/org/opengraph/OpenGraphTest.java +++ b/plugin/src/test/java/org/opengraph/OpenGraphTest.java @@ -1,15 +1,16 @@ package org.opengraph; -import org.junit.Test; -import org.opengraph.OpenGraph; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + -import static org.junit.Assert.*; +import org.junit.Test; public class OpenGraphTest { @Test public void shouldHandleMissingContentType() throws 
java.lang.Exception { - OpenGraph site = new OpenGraph("http://www.bbc.com/future/story/20140428-the-myth-of-tech-revolutions", true); - assertEquals("Why it’s time to ditch the word ‘revolution’ in tech", site.getContent("title")); + OpenGraph site = new OpenGraph("https://www.bbc.com/future/article/20140428-the-myth-of-tech-revolutions", true); + assertEquals("Why it’s time to ditch the word ‘revolution’ in tech", site.getContent("title")); assertEquals("624", site.getContent("image:width")); } diff --git a/pom.xml b/pom.xml index 3281a04..d1c6d97 100644 --- a/pom.xml +++ b/pom.xml @@ -1,41 +1,122 @@ - 4.0.0 - OpenGraph - OpenGraph - 0.0.1-SNAPSHOT - A Facebook OpenGraph implementation for Java - OpenGraph for Java + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + 4.0.0 + opengraph + opengraph + 0.0.2-SNAPSHOT + pom + A Facebook OpenGraph implementation for Java + OpenGraph for Java - - - org.hamcrest - hamcrest-core - 1.3 - - - net.sourceforge.htmlcleaner - htmlcleaner - 2.16 - - - junit - junit - 4.12 - - + + 1.8 + 1.8 + UTF-8 + + + + plugin + examples + + + + org.hamcrest + hamcrest-core + 1.3 + + + net.sourceforge.htmlcleaner + htmlcleaner + 2.24 + + + junit + junit + 4.13.1 + + + + + + BSD-3-Clause + https://opensource.org/licenses/BSD-3-Clause + Best guess license from repo comments. Compatible with the HTMLCleaner BSD-3-Clause license. 
+ + manual + + + + + + John Deverall + johndeverall@gmail.com + https://github.com/johndeverall + John Deverall + https://github.com/johndeverall + + developer + + Pacific/Auckland + + + + + + Callum Jones + https://github.com/callumj + Callum Jones + https://github.com/callumj/ + + contributor + + America/Los_Angeles + + + Ruslan Khmelyuk + ruslan@khmelyuk.xyz + http://www.khmelyuk.com/ + Ruslan Khmelyuk + http://www.khmelyuk.com/ + + contributor + + America/Los_Angeles + + + Niall Kennedy + niall@niallkennedy.com + https://www.niallkennedy.com/blog/ + Niall Kennedy + https://www.niallkennedy.com/blog/ + + contributor + + America/Los_Angeles + + @niall + + + + Timothy Stone + javafueled@gmail.com + https://github.com/timothystone + Timothy Stone + https://www.anothercaffeinatedday.com/ + + contributor + + America/New_York + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + - - src - - - maven-compiler-plugin - 3.3 - - - - - - - \ No newline at end of file From 63f10d172fb4e988c579bc5477594ca75014a455 Mon Sep 17 00:00:00 2001 From: Timothy Stone Date: Thu, 5 Aug 2021 09:02:29 -0400 Subject: [PATCH 11/11] Commit WIP --- .../src/main/java/org/opengraph/OpenGraph.java | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/plugin/src/main/java/org/opengraph/OpenGraph.java b/plugin/src/main/java/org/opengraph/OpenGraph.java index 3c02a0d..766a932 100644 --- a/plugin/src/main/java/org/opengraph/OpenGraph.java +++ b/plugin/src/main/java/org/opengraph/OpenGraph.java @@ -2,8 +2,8 @@ import java.io.BufferedReader; import java.io.InputStreamReader; +import java.net.HttpURLConnection; import java.net.URL; -import java.net.URLConnection; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Hashtable; @@ -67,8 +67,18 @@ public OpenGraph(String url, boolean ignoreSpecErrors) throws java.io.IOExceptio // download the (X)HTML content, but only up to the closing head tag. 
We do not want to waste resources parsing irrelevant content + System.out.println(url); URL pageURL = new URL(url); - URLConnection siteConnection = pageURL.openConnection(); + HttpURLConnection siteConnection = (HttpURLConnection) pageURL.openConnection(); + siteConnection.connect(); + if(siteConnection.getHeaderField("Location") != null ) + { + String redirect = siteConnection.getHeaderField("Location"); + siteConnection.disconnect(); + pageURL = new URL(redirect); + siteConnection = (HttpURLConnection) pageURL.openConnection(); + siteConnection.connect(); + } Charset charset = getConnectionCharset(siteConnection); BufferedReader dis = new BufferedReader(new InputStreamReader(siteConnection.getInputStream(), charset)); String inputLine; @@ -184,7 +194,7 @@ public OpenGraph(String url, boolean ignoreSpecErrors) throws java.io.IOExceptio * @return the Charset object for response charset name; * if it's not found then the default charset. */ - private static Charset getConnectionCharset(URLConnection connection) { + private static Charset getConnectionCharset(HttpURLConnection connection) { String contentType = connection.getContentType(); if (contentType != null && contentType.length() > 0) { contentType = contentType.toLowerCase();