Extract links from an HTML page — Tag(s): String/Number, Networking
Using javax.swing.text.html.HTMLEditorKit
import java.io.IOException;
import java.io.FileReader;
import java.io.Reader;
import java.util.List;
import java.util.ArrayList;
import javax.swing.text.html.parser.ParserDelegator;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.MutableAttributeSet;

/**
 * Utility for pulling hyperlink targets out of an HTML document using the
 * HTML parser bundled with Swing ({@link ParserDelegator}).
 */
public class HTMLUtils {

  /** Static utility class; not meant to be instantiated. */
  private HTMLUtils() {}

  /**
   * Collects the {@code href} value of every {@code <a>} start tag found in
   * the given HTML, in document order.
   *
   * <p>Anchors without an {@code href} attribute (for example named anchors
   * such as {@code <a name="top">}) are skipped, so the returned list never
   * contains {@code null}.
   *
   * @param reader source of the HTML text; it is not closed by this method
   * @return the {@code href} values exactly as written in the document
   * @throws IOException if reading from {@code reader} fails
   */
  public static List<String> extractLinks(Reader reader) throws IOException {
    final List<String> links = new ArrayList<String>();

    // ParserCallback already provides no-op implementations for every
    // handler, so only the start-tag handler needs to be overridden.
    ParserCallback callback = new ParserCallback() {
      @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attributes, int pos) {
        if (tag == Tag.A) {
          Object href = attributes.getAttribute(Attribute.HREF);
          // Skip anchors that carry no usable href instead of storing null.
          if (href instanceof String) {
            links.add((String) href);
          }
        }
      }
    };

    // ignoreCharSet = true: the input is an already-decoded Reader, so a
    // <meta ... charset=...> declaration must not abort the parse with a
    // ChangedCharSetException.
    new ParserDelegator().parse(reader, callback, true);
    return links;
  }

  /**
   * Demo: prints every link found in the local file {@code java-new.html}.
   *
   * @param args ignored
   * @throws Exception if the file cannot be opened or read
   */
  public final static void main(String[] args) throws Exception {
    FileReader reader = new FileReader("java-new.html");
    try {
      List<String> links = HTMLUtils.extractLinks(reader);
      for (String link : links) {
        System.out.println(link);
      }
    } finally {
      // Release the file handle even when parsing fails.
      reader.close();
    }
  }
}
Using an HTML parser
In this HowTo, I will use the OpenSource package Jsoup.
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Utility for listing the URLs referenced by a web page, using jsoup to
 * fetch and query the document.
 */
public class HTMLUtils {

  /** Static utility class; not meant to be instantiated. */
  private HTMLUtils() {}

  /**
   * Fetches the page at {@code url} and returns the URLs it references:
   * first the targets of {@code a[href]} elements, then the {@code src} of
   * every element that has one, then the targets of {@code link[href]}
   * elements.
   *
   * <p>Values are read through jsoup's {@code abs:} attribute prefix, which
   * asks jsoup for the absolute form of the URL.
   *
   * @param url address of the page to download
   * @return the collected URLs, grouped in the order described above
   * @throws IOException if the page cannot be retrieved
   */
  public static List<String> extractLinks(String url) throws IOException {
    Document page = Jsoup.connect(url).get();
    List<String> collected = new ArrayList<String>();

    // href ...
    appendAttribute(collected, page.select("a[href]"), "abs:href");
    // img ...
    appendAttribute(collected, page.select("[src]"), "abs:src");
    // js, css, ...
    appendAttribute(collected, page.select("link[href]"), "abs:href");

    return collected;
  }

  /** Appends the value of {@code attributeKey} for each element to {@code sink}. */
  private static void appendAttribute(List<String> sink, Elements elements, String attributeKey) {
    for (Element element : elements) {
      sink.add(element.attr(attributeKey));
    }
  }

  /**
   * Demo: prints every URL referenced by a sample page.
   *
   * @param args ignored
   * @throws Exception if the page cannot be fetched
   */
  public final static void main(String[] args) throws Exception {
    List<String> found = HTMLUtils.extractLinks("http://www.rgagnon.com/topics/java-language.html");
    for (int i = 0; i < found.size(); i++) {
      System.out.println(found.get(i));
    }
  }
}
See also how to extract text from an HTML page.
mail_outline
Send comment, question or suggestion to howto@rgagnon.com
Send comment, question or suggestion to howto@rgagnon.com