30

How can I convert an MS Word file to PDF using Apache POI?

I am using the following code, but it's not working and gives errors. I guess I am importing the wrong classes?

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.OutputStream;

import org.apache.poi.hslf.record.Document;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;


public class TestCon {

    /**
     * Reads the Word file /document/test2.doc, extracts its paragraph text
     * with WordExtractor, and adds each paragraph to a PDF document that is
     * written to /document/test.pdf.
     *
     * NOTE(review): with the imports listed above this class, Document resolves
     * to org.apache.poi.hslf.record.Document and Paragraph resolves to
     * org.apache.poi.hwpf.usermodel.Paragraph, and PdfWriter is not imported at
     * all. Presumably the PDF-side classes were meant to come from a PDF
     * library rather than POI -- confirm.
     *
     * @param args command-line arguments; not read by this method
     */
    public static void main(String[] args) {
        // TODO Auto-generated method stub

        POIFSFileSystem fs = null;  
         // NOTE(review): this is the POI HSLF record class per the imports, not a PDF document type.
         Document document = new Document(); 

         try {  
             System.out.println("Starting the test");  
             fs = new POIFSFileSystem(new FileInputStream("/document/test2.doc"));  

             HWPFDocument doc = new HWPFDocument(fs);  
             WordExtractor we = new WordExtractor(doc);  

             OutputStream file = new FileOutputStream(new File("/document/test.pdf")); 

             // NOTE(review): PdfWriter has no matching import among those listed above.
             PdfWriter writer = PdfWriter.getInstance(document, file);  

             Range range = doc.getRange();
             document.open();  
             writer.setPageEmpty(true);  
             document.newPage();  
             writer.setPageEmpty(true);  

             String[] paragraphs = we.getParagraphText();  
             for (int i = 0; i < paragraphs.length; i++) {  

                 // pr is only referenced by the commented-out formatting experiment below.
                 org.apache.poi.hwpf.usermodel.Paragraph pr = range.getParagraph(i);
                // CharacterRun run = pr.getCharacterRun(i);
                // run.setBold(true);
                // run.setCapitalized(true);
                // run.setItalic(true);
                 // Remove the line terminator (optional control-M, optional CR, then LF) from the text.
                 paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");  
             System.out.println("Length:" + paragraphs[i].length());  
             System.out.println("Paragraph" + i + ": " + paragraphs[i].toString());  

             // add the paragraph to the document  
             // NOTE(review): Paragraph here is the POI HWPF class per the imports.
             document.add(new Paragraph(paragraphs[i]));  
             }  

             System.out.println("Document testing completed");  
         } catch (Exception e) {  
             System.out.println("Exception during test");  
             e.printStackTrace();  
         } finally {  
                         // close the document  
            document.close();  
                     }  
         }  
    }
Amedee Van Gasse
  • 7,280
  • 5
  • 55
  • 101
Harinder
  • 11,776
  • 16
  • 70
  • 126
  • 2
    Hello Denis, when I try to convert a Word file to PDF I get errors on the following imports: import com.lowagie.text.Document; import com.lowagie.text.DocumentException; import com.lowagie.text.Paragraph; import com.lowagie.text.pdf.PdfWriter; Please tell me which library I forgot to add, and, if possible, give me a link to download it. – DynamicMind Aug 08 '11 at 11:36

8 Answers8

16

Got It solved

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.OutputStream;

import com.lowagie.text.Document;
import com.lowagie.text.DocumentException;
import com.lowagie.text.Paragraph;
import com.lowagie.text.pdf.PdfWriter;


import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;

import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;


public class TestCon {

    /**
     * Converts the Word file {@code D:/Resume.doc} to {@code D:/test.pdf}.
     *
     * <p>The paragraph text is extracted with POI's {@code WordExtractor} and each
     * paragraph is added to an iText {@code Document} as a plain {@code Paragraph}.
     * Only text is carried over. Any failure is reported on standard output
     * together with a stack trace.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        try {
            System.out.println("Starting the test");

            // try-with-resources closes the input file even when parsing fails.
            try (FileInputStream wordIn = new FileInputStream("D:/Resume.doc")) {
                HWPFDocument doc = new HWPFDocument(new POIFSFileSystem(wordIn));
                WordExtractor we = new WordExtractor(doc);
                String[] paragraphs = we.getParagraphText();

                // The output file is only created once the Word file was read successfully.
                try (OutputStream pdfOut = new FileOutputStream(new File("D:/test.pdf"))) {
                    Document document = new Document();
                    try {
                        PdfWriter writer = PdfWriter.getInstance(document, pdfOut);

                        document.open();
                        writer.setPageEmpty(true);
                        document.newPage();
                        writer.setPageEmpty(true);

                        for (int i = 0; i < paragraphs.length; i++) {
                            // Strip the trailing line terminator of the extracted paragraph.
                            String text = paragraphs[i].replaceAll("\\cM?\r?\n", "");
                            System.out.println("Length:" + text.length());
                            System.out.println("Paragraph" + i + ": " + text);

                            // add the paragraph to the document
                            document.add(new Paragraph(text));
                        }
                    } finally {
                        // Close the document while the underlying stream is still
                        // open, so buffered PDF content can be written out.
                        document.close();
                    }
                }
            }

            System.out.println("Document testing completed");
        } catch (Exception e) {
            System.out.println("Exception during test");
            e.printStackTrace();
        }
    }
}
Harinder
  • 11,776
  • 16
  • 70
  • 126
  • @Harinder i am trying to execute this sample (conversion doc to pdf) and getting java.lang.NullPointerException: Attempt to invoke interface method 'org.w3c.dom.Node org.w3c.dom.Node.removeChild(org.w3c.dom.Node)' on a null object reference exception. Are you able to run this successfully on Android platform? – vickyVick May 16 '18 at 03:56
  • 4
    Hi @Harinder, Being a Long term user of STACK OVERFLOW, you would know code-only answers are discouraged here, could you edit your answer to explain why this answers the question? It'll help teach others rather than just encouraging copy-paste coding. Thanks very much :-) – Vetrivel PS Jan 03 '19 at 18:28
  • 2
    As @VetrivelPS mention, adding more information to this answer would have been useful. Especially the version of APIs used. – Sastrija Aug 06 '19 at 20:50
  • It's removing all headings, not considering images inside doc files. – cody123 Oct 21 '19 at 05:49
12

This worked For Me:-

Source :- http://www.programcreek.com/java-api-examples/index.php?api=org.apache.poi.xwpf.converter.pdf.PdfConverter

package pdf;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.OutputStream;

import org.apache.poi.xwpf.converter.pdf.PdfConverter;
import org.apache.poi.xwpf.converter.pdf.PdfOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

public class PDF {
    /**
     * Converts a .docx file to PDF.
     *
     * <p>Without arguments, {@code D:/TEST.docx} is converted to {@code D:/TEST.pdf}.
     * When exactly two arguments are given they are used as the input and
     * output paths respectively.
     *
     * @param args optional {@code [inputFile, outputFile]} pair
     * @throws Exception if the input cannot be read, converted, or written
     */
    public static void main(String[] args) throws Exception {
        String inputFile = "D:/TEST.docx";
        String outputFile = "D:/TEST.pdf";
        if (args != null && args.length == 2) {
            inputFile = args[0];
            outputFile = args[1];
        }
        System.out.println("inputFile:" + inputFile + ",outputFile:"+ outputFile);

        // Both streams are closed automatically, even when the conversion throws.
        try (FileInputStream in = new FileInputStream(inputFile)) {
            XWPFDocument document = new XWPFDocument(in);
            try (OutputStream out = new FileOutputStream(new File(outputFile))) {
                // NOTE(review): null options are passed exactly as before; presumably
                // the converter falls back to its defaults -- confirm.
                PdfOptions options = null;
                PdfConverter.getInstance().convert(document, out, options);
            }
        }
    }
}
Kushagra Sahni
  • 161
  • 2
  • 7
  • @KamilIbadov: Use the following Maven dependencies (groupId:artifactId:version): org.apache.poi:poi:3.13, org.apache.poi:poi-ooxml:3.13, fr.opensagres.xdocreport:org.apache.poi.xwpf.converter.pdf:LATEST. If you still face any error, drop me a mail at kushagra.sahni93@gmail.com – Kushagra Sahni May 20 '17 at 22:40
  • org.apache.poi.xwpf.converter.pdf.PdfConverter (and PdfOptions) is not a part of Apache POI but xDocReport which misused Apache POI namespace See https://github.com/opensagres/xdocreport/issues/174 Nowadays their PdfConverter is in the package fr.opensagres.odfdom.converter.pdf – Fenix Nov 06 '17 at 16:55
  • Hi @Kushagra Sahni, Being a Long New user of STACK OVERFLOW, you would know code-only answers are discouraged here, could you edit your answer to explain why this answers the question? It'll help teach others rather than just encouraging copy-paste coding. Thanks very much :-) – Vetrivel PS Jan 03 '19 at 18:29
  • 1
    Getting this error : java.lang.NoClassDefFoundError: org/apache/poi/POIXMLDocumentPart – Anmol Jain Apr 07 '22 at 19:51
4

In addition to Kushagra's answer, here are the updated Maven dependencies:

    <!-- Four fr.opensagres.xdocreport artifacts, all at version 2.0.1 -->
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
        <version>2.0.1</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.xdocreport.converter</artifactId>
        <version>2.0.1</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.poi.xwpf.converter.pdf</artifactId>
        <version>2.0.1</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
        <version>2.0.1</version>
    </dependency>
Erich13
  • 69
  • 5
  • 2
    Hi @Erich13, code-only answers are discouraged here, could you edit your answer to explain why this answers the question? It'll help teach others rather than just encouraging copy-paste coding. Thanks very much :-) – Vetrivel PS Jan 03 '19 at 18:27
3

The below code worked for me:

Public class DocToPdfConverter{

public static void main(String[] args) {

        String k=null;
        OutputStream fileForPdf =null;
        try {

            String fileName="/document/test2.doc";
            //Below Code is for .doc file 
            if(fileName.endsWith(".doc"))
            {
            HWPFDocument doc = new HWPFDocument(new FileInputStream(
                    fileName));
            WordExtractor we=new WordExtractor(doc);
            k = we.getText();

             fileForPdf = new FileOutputStream(new File(
                        "/document/DocToPdf.pdf")); 
            we.close();
            }

            //Below Code for 

            else if(fileName.endsWith(".docx"))
            {
                XWPFDocument docx = new XWPFDocument(new FileInputStream(
                        fileName));
                // using XWPFWordExtractor Class
                XWPFWordExtractor we = new XWPFWordExtractor(docx);
                 k = we.getText();

                 fileForPdf = new FileOutputStream(new File(
                            "/document/DocxToPdf.pdf"));    
                 we.close();
            }



            Document document = new Document();
            PdfWriter.getInstance(document, fileForPdf);

            document.open();

            document.add(new Paragraph(k));

            document.close();
            fileForPdf.close();



        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Tim Malone
  • 3,364
  • 5
  • 37
  • 50
  • 3
    Hello, welcome to StackOverflow and thank you for your answer. When posting code, please indent it by 4 characters (or use the code-formatting button on the toolbar) to ensure it displays as code (I've suggested an edit for you to fix that). Also, as code-only answers are discouraged here, could you edit your answer to explain _why_ this answers the question? It'll help teach others rather than just encouraging copy-paste coding. Thanks very much! – Tim Malone Aug 12 '16 at 07:46
2

As a side note, it's also possible to read content on-the-fly directly from a Word/Excel content stream instead of reading it from the filesystem and serializing it to disk, for example when retrieving content from CMIS repositories:

e.g.

 // Previous construction from the variable fs, kept for comparison:
 //HWPFDocument docx = new HWPFDocument(fs);  
 // Build the document directly from the stream returned by doc.getContentStream().getStream().
 HWPFDocument docx = new HWPFDocument(doc.getContentStream().getStream()); 

(doc is of type org.apache.chemistry.opencmis.client.api.Document and in this case I adapted your code to retrieve a word file from an Alfresco repository by means of opencmis and transformed it to PDF)

HTH

Mysticial
  • 464,885
  • 45
  • 335
  • 332
theshadow
  • 161
  • 1
  • 2
2

There are several steps here:

  1. Read Word document using POI into a format-agnostic form
  2. Convert format-agnostic form into PDF
  3. Write PDF

I don't know if POI will do step 2 for you. I'd recommend something else, like iText.

duffymo
  • 305,152
  • 44
  • 369
  • 561
  • 4
    The code in your initial post wasn't mentioning the lowagie/iText packages. I was already puzzled as to where to find something PDF related in the POI library. Duffymo is correct in the steps he listed. In a similar situation I use 'WordML' (Word 2003 xml format) that is transformed into FO and then rendered using Apache FOP. There are other possibilities, including OpenOffice API. Search through StackOverflow and you'll find plenty of questions/answers about Office2PDF. – Wivani Jun 06 '11 at 10:59
0

This saved my day: I load a docx file from a URL and convert it to PDF:

pom.xml

<!-- Apache POI core and OOXML support, both at version 3.13 -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.13</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.13</version>
</dependency>
<!-- NOTE(review): LATEST is not a fixed version, so the resolved artifact may
     change between builds; consider pinning an explicit version - confirm
     which one is compatible with POI 3.13. -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
    <version>LATEST</version>
</dependency>

main_class

/**
 * Downloads a .docx document from the given URL, converts it to PDF in
 * memory, and returns the PDF bytes Base64-encoded.
 *
 * @param url location of the .docx document to convert
 * @return the generated PDF as a Base64 string
 * @throws Exception if the URL cannot be opened or the document cannot be
 *         read or converted
 */
public String wordToPDFPOI(String url) throws Exception {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();

    // try-with-resources closes the URL stream even when parsing or
    // conversion fails; previously it was never closed.
    try (InputStream doc = new URL(url).openStream()) {
        XWPFDocument document = new XWPFDocument(doc);
        PdfOptions options = PdfOptions.create();
        PdfConverter.getInstance().convert(document, baos, options);
    }

    return Base64.encodeBytes(baos.toByteArray());
}
ImYuta
  • 41
  • 6
0

All of the answers above will fail if the document contains images. I would not suggest using Apache POI, since its library for converting Word to PDF has been discontinued. As of today I don't think there is any open-source library that does the conversion on its own (they require extra dependencies; some need MS Word to be installed, etc.). The best way I can think of (it will only work if you are deploying the project on a Linux machine) is to install LibreOffice (open source) on the Linux machine and run this:

  // Pass every argument separately instead of one concatenated command line:
  // Runtime.exec(String) splits on whitespace, which breaks paths containing
  // spaces. String.valueOf keeps this working for non-String path values.
  ProcessBuilder converter = new ProcessBuilder(
          "libreoffice", "--headless", "--convert-to", "pdf",
          String.valueOf(inputPath), "--outdir", String.valueOf(outputPath));
  // Forward the tool's output to this process's console.
  converter.inheritIO();

  try {
      // Wait for the conversion to finish so the PDF exists when this code
      // returns, and report a failed conversion instead of ignoring it.
      int exitCode = converter.start().waitFor();
      if (exitCode != 0) {
          System.err.println("libreoffice exited with code " + exitCode);
      }
  } catch (IOException e) {
      e.printStackTrace();
  } catch (InterruptedException e) {
      // Restore the interrupt flag so callers can observe the interruption.
      Thread.currentThread().interrupt();
  }
Anmol Jain
  • 336
  • 4
  • 12