Remove XML tags from a string to keep only text Tag(s): XML


First we define an XSLT template.

[onlytext.xsl]

<?xml version="1.0" encoding="UTF-8"?>
<!-- Produces plain-text output (output method "text") from the input document. -->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:fo="http://www.w3.org/1999/XSL/Format">
    <xsl:output method="text" indent="no"/>
    <!-- Each whitespace-only text node is replaced by a single line feed.
         No other template is declared here, so all remaining nodes fall
         through to XSLT's built-in template rules. -->
    <xsl:template match="//text()[normalize-space(.) = '']">
        <xsl:text>&#xA;</xsl:text>
    </xsl:template>
</xsl:stylesheet>
Suppose we have the following XML file:

[howto.xml]

<?xml version="1.0"?>
<howto>
   <topic id="1">
      <title>Java</title>
      <url>http://www.rgagnon.com/topics/java-io.html</url>
   </topic>
   <topic id="2">
      <title>XML</title>
      <url>http://www.rgagnon.com/topics/java-xml.html</url>
   </topic>
   <topic id="3">
      <title>Javascript</title>
      <url>http://www.rgagnon.com/topics/js-language.html</url>
   </topic>
   <topic id="4">
      <title>VBScript</title>
      <url>http://www.rgagnon.com/topics/wsh-vbs.html</url>
   </topic>
</howto>

And here is the Java code that applies the template to the XML:

import java.io.File;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.w3c.dom.Document;

/**
 * Command-line example that applies an XSLT stylesheet to an XML file and
 * writes the transformation result to a text file.
 */
public class XMLUtils {
   /**
    * Compiles {@code /temp/onlytext.xsl}, parses {@code /temp/howto.xml} into a
    * DOM tree, runs the transformation and writes the output to
    * {@code /temp/howto.txt}. Prints {@code Done.} on standard output afterwards.
    *
    * @param args command-line arguments (not used)
    * @throws Exception if the stylesheet cannot be compiled, the XML cannot be
    *         parsed, or the transformation fails
    */
   public static void main(String[] args) throws Exception {
       File stylesheet = new File("/temp/onlytext.xsl");
       File xmlSource = new File("/temp/howto.xml");
       File txtOutput = new File("/temp/howto.txt");

       StreamSource stylesource = new StreamSource(stylesheet);
       Transformer transformer = TransformerFactory.newInstance()
             .newTransformer(stylesource);

       DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
       // XSLT requires namespace support: a DOM handed to a Transformer must be
       // built by a namespace-aware parser, which is not the factory default.
       factory.setNamespaceAware(true);
       DocumentBuilder builder = factory.newDocumentBuilder();
       Document document = builder.parse(xmlSource);

       transformer.transform(new DOMSource(document), new StreamResult(txtOutput));
       // To print to the console instead, use new StreamResult(System.out)
       // as the transformation result.

       System.out.println("Done.");
   }
}
The result :


Java
http://www.rgagnon.com/topics/java-io.html


XML
http://www.rgagnon.com/topics/java-xml.html


Javascript
http://www.rgagnon.com/topics/js-language.html


VBScript
http://www.rgagnon.com/topics/wsh-vbs.html


See also : Remove HTML tags from a file to extract only the TEXT.
blog comments powered by Disqus