Share this page 

Remove XML tags from a string to keep only text Tag(s): XML


First we define an XSLT template.

[onlytext.xsl]


<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:fo="http://www.w3.org/1999/XSL/Format">

    <!-- Serialize the result as plain text: no tags are written to the output. -->
    <xsl:output method="text" indent="no"/>

    <!--
        Replace every whitespace-only text node (typically the indentation
        between elements) with a single line feed. No other template is
        declared, so all remaining nodes are handled by the XSLT built-in
        template rules, which copy the text content through.
    -->
    <xsl:template match="//text()[normalize-space(.) = '']">
        <xsl:text>&#xA;</xsl:text>
    </xsl:template>

</xsl:stylesheet>

Suppose we have this XML file

[howto.xml]


<?xml version="1.0"?>

<howto>

   <topic id="1">

      <title>Java</title>

      <url>http://www.rgagnon.com/topics/java-io.html</url>

   </topic>

   <topic id="2">

      <title>XML</title>

      <url>http://www.rgagnon.com/topics/java-xml.html</url>

   </topic>

   <topic id="3">

      <title>Javascript</title>

      <url>http://www.rgagnon.com/topics/js-language.html</url>

   </topic>

   <topic id="4">

      <title>VBScript</title>

      <url>http://www.rgagnon.com/topics/wsh-vbs.html</url>

   </topic>

</howto>

And here is the Java code that applies the template to the XML file.


import java.io.File;
import java.io.IOException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.w3c.dom.Document;
import org.xml.sax.SAXException;



/**
 * Command-line example that removes the markup from an XML file and keeps only
 * its text by applying an XSLT stylesheet to it.
 */
public class XMLUtils {

   /** Stylesheet path used when none is given on the command line. */
   private static final String DEFAULT_STYLESHEET = "/temp/onlytext.xsl";

   /** XML input path used when none is given on the command line. */
   private static final String DEFAULT_XML_SOURCE = "/temp/howto.xml";

   /** Text output path used when none is given on the command line. */
   private static final String DEFAULT_TXT_OUTPUT = "/temp/howto.txt";

   /**
    * Applies a stylesheet to an XML file, writes the result to a text file and
    * prints {@code Done.} on standard output.
    *
    * <p>To print the result on the console instead of writing a file, pass
    * {@code new StreamResult(System.out)} as the transformation result.
    *
    * @param args optional positional paths: {@code [stylesheet] [xmlSource] [txtOutput]};
    *             every missing position falls back to its {@code /temp/...} default
    * @throws Exception if the stylesheet or the XML source cannot be read or parsed,
    *                   or if the transformation fails
    */
   public static void main(String[] args) throws Exception {
       File stylesheet = new File(argOrDefault(args, 0, DEFAULT_STYLESHEET));
       File xmlSource = new File(argOrDefault(args, 1, DEFAULT_XML_SOURCE));
       File txtOutput = new File(argOrDefault(args, 2, DEFAULT_TXT_OUTPUT));

       extractText(stylesheet, xmlSource, txtOutput);

       System.out.println("Done.");
   }

   /** Returns {@code args[index]} when that position exists, otherwise {@code fallback}. */
   private static String argOrDefault(String[] args, int index, String fallback) {
       return (args != null && args.length > index) ? args[index] : fallback;
   }

   /** Parses {@code xmlSource}, transforms it with {@code stylesheet} and writes {@code txtOutput}. */
   private static void extractText(File stylesheet, File xmlSource, File txtOutput)
         throws IOException, ParserConfigurationException, SAXException, TransformerException {
       Transformer transformer = TransformerFactory.newInstance()
             .newTransformer(new StreamSource(stylesheet));

       DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
       // XSLT requires namespace support; the factory default is NOT namespace-aware,
       // and a DOM built that way may be transformed incorrectly.
       factory.setNamespaceAware(true);
       DocumentBuilder builder = factory.newDocumentBuilder();
       Document document = builder.parse(xmlSource);

       transformer.transform(new DOMSource(document), new StreamResult(txtOutput));
   }

}

The result :





Java

http://www.rgagnon.com/topics/java-io.html





XML

http://www.rgagnon.com/topics/java-xml.html





Javascript

http://www.rgagnon.com/topics/js-language.html





VBScript

http://www.rgagnon.com/topics/wsh-vbs.html




See also : Remove HTML tags from a file to extract only the TEXT.