Dal formato PDF al formato HTML
Generalmente i documenti vengono portati verso il formato pdf, anche quando nascono in altri formati, per la versatilità del formato pdf; qualche volta, però, può essere utile estrarre il testo da un pdf per andare verso altri formati, quale ad esempio html. Quest’ultimo aumenta la capacità di visualizzazione su dispositivi mobili, dove visualizzare un documento pdf è arduo a meno di utilizzare reader dedicati (ed a pagamento).
In .net andare da pdf ad html per i testi è molto semplice.
quello che occorre:
1) .net :-) ,
2) PdfBox, a mio avviso una delle migliori librerie open per la gestione del formato pdf,
3) IKVM una java virtual machine open per .net (http://www.ikvm.net/) che permette di usare pdfbox (java) in .net
come fare :
aggiungiamo le librerie come riferimenti al nostro progetto winforms
istanziamo un documento PDDocument
usando il metodo load PDDocument.load(pathdelfilepdf)
istanziamo un oggetto PDFTextStripper
e ne usiamo il metodo getText, a cui passiamo come argomento il risultato della load precedente
un po’ di pseudocodice:
' Load the PDF whose path is typed in TextBox1.
Dim pdfDocument As PDDocument = PDDocument.load(Me.TextBox1.Text)
Dim pdfOnString As String = Nothing
Try
    ' PDFTextStripper.getText returns the document's text as a single string.
    Dim pdfLoader As New PDFTextStripper()
    pdfOnString = pdfLoader.getText(pdfDocument)
Finally
    ' Close the document even when text extraction throws, so it is not leaked.
    pdfDocument.close()
End Try
' Hand the extracted text over to the HTML generation step.
Me.FromPdfToHtml(pdfOnString)
implementiamo un metodo che costruisca lo scheletro html esterno; l’oggetto currFont rappresenta
il risultato della scelta effettuata chiamando un oggetto FontDialog
''' <summary>
''' Wraps the text extracted from a PDF in a minimal HTML page, styled with the
''' font held in currFont, writes the page through WriteHtmlToFile and reports
''' the outcome in a message box.
''' </summary>
''' <param name="pdfString">Plain text extracted from the PDF; Nothing is treated as empty text.</param>
''' <returns>True when WriteHtmlToFile reported success, False otherwise.</returns>
Private Function FromPdfToHtml(ByVal pdfString As String) As Boolean
    ' NOTE(review): the opening and closing markup of the published listing was lost
    ' (the first literal was unterminated and the last one was empty), so the tags
    ' below are a reconstruction - confirm against the intended page skeleton.
    Dim html As String = "<html><head><meta http-equiv=""Content-Type"" content=""text/html; charset=utf-8""></head><body>"

    If Me.currFont Is Nothing Then
        ' No font has been chosen: emit an unstyled container instead of failing.
        html = html & "<div>"
    Else
        ' Format the size with the invariant culture: cultures that use a decimal
        ' comma would otherwise produce e.g. "8,25pt", which is not valid CSS.
        Dim fontSize As String = Me.currFont.SizeInPoints.ToString(System.Globalization.CultureInfo.InvariantCulture)
        ' The family name is quoted because names may contain spaces.
        html = html & "<div style=""font-family:'" & Me.currFont.FontFamily.Name & "'"
        html = html & ";font-size:" & fontSize & "pt"">"
    End If

    ' Escape the characters that are significant in HTML so that the extracted text
    ' is displayed literally instead of being parsed as markup.
    Dim safeText As String = String.Empty
    If pdfString IsNot Nothing Then
        safeText = pdfString.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;")
    End If
    html = html & safeText
    html = html & "</div></body></html>"

    ' Report the real outcome of the write instead of always returning False.
    Dim result As Boolean = Me.WriteHtmlToFile(html)
    Dim str As String
    If result Then
        str = "File scritto"
    Else
        str = "File non scritto"
    End If
    MessageBox.Show(str)
    Return result
End Function
la seguente funzione si occupa di scrivere il file html su disco
''' <summary>
''' Writes the given HTML text to the file whose path is typed in TextBox2.
''' </summary>
''' <param name="inputHtml">The HTML content to write.</param>
''' <returns>
''' True when the content was written without an exception; False when an
''' exception occurred (its details are shown in a message box).
''' </returns>
Private Function WriteHtmlToFile(ByVal inputHtml As String) As Boolean
    Dim writeProcess As Boolean = False
    Try
        ' Using disposes the writer, releasing the file, even when Write throws.
        Using writer As New StreamWriter(Me.TextBox2.Text)
            writer.Write(inputHtml)
        End Using
        writeProcess = True
    Catch ex As Exception
        ' "&" treats Nothing as an empty string, so a missing Source or StackTrace
        ' cannot raise a second exception while the first one is being reported.
        MessageBox.Show(ex.Message & ex.Source & ex.StackTrace)
    End Try
    Return writeProcess
End Function
il codebehind per un form che svolga questa funzione è ad esempio il seguente
Imports IKVM
Imports gnu
Imports org.pdfbox
Imports org.pdfbox.util
Imports org.pdfbox.pdmodel
Imports System.IO
Imports System.Text
Imports org.fontbox
''' <summary>
''' Form that converts the text of a PDF file into an HTML file: the user picks
''' the input PDF, the output file and a font, then starts the conversion.
''' </summary>
Public Class Form1
    ' Path of the PDF selected through OpenFileDialog1.
    Private currFileInput As String = Nothing
    ' Path of the HTML file selected through SaveFileDialog1.
    Private currFileOutput As String = Nothing
    ' Font selected through FontDialog1; Nothing until the user picks one.
    Private currFont As System.Drawing.Font

    ''' <summary>Load handler; intentionally empty.</summary>
    Private Sub Form1_Load(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles MyBase.Load
    End Sub

    Public Sub New()
        ' This call is required by the Windows Form Designer.
        InitializeComponent()
        ' Add any initialization after the InitializeComponent() call.
    End Sub

    ''' <summary>Lets the user choose the input PDF and shows its path in TextBox1.</summary>
    Private Sub Button2_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button2.Click
        If (Me.OpenFileDialog1.ShowDialog() = Windows.Forms.DialogResult.OK) Then
            Me.currFileInput = Me.OpenFileDialog1.FileName
            Me.TextBox1.Text = Me.currFileInput
        End If
    End Sub

    ''' <summary>Lets the user choose the output file and shows its path in TextBox2.</summary>
    Private Sub Button3_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button3.Click
        If (Me.SaveFileDialog1.ShowDialog() = Windows.Forms.DialogResult.OK) Then
            Me.currFileOutput = Me.SaveFileDialog1.FileName
            Me.TextBox2.Text = Me.currFileOutput
        End If
    End Sub

    ''' <summary>Lets the user choose the font and shows its name and size in TextBox3.</summary>
    Private Sub Button4_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button4.Click
        If (Me.FontDialog1.ShowDialog() = Windows.Forms.DialogResult.OK) Then
            Me.TextBox3.Text = Me.FontDialog1.Font.Name & Me.FontDialog1.Font.SizeInPoints.ToString
            Me.currFont = Me.FontDialog1.Font
        End If
    End Sub

    ''' <summary>
    ''' Runs the conversion when the three text boxes are filled in: extracts the
    ''' text of the PDF named in TextBox1 and passes it to FromPdfToHtml.
    ''' </summary>
    Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click
        ' AndAlso stops at the first empty field instead of evaluating all three checks.
        If (Me.ControlValidText(Me.TextBox1) AndAlso Me.ControlValidText(Me.TextBox2) AndAlso Me.ControlValidText(Me.TextBox3)) Then
            Dim pdfDocument As PDDocument = PDDocument.load(Me.TextBox1.Text)
            Dim pdfOnString As String = Nothing
            Try
                ' PDFTextStripper.getText returns the document's text as a single string.
                Dim pdfLoader As New PDFTextStripper()
                pdfOnString = pdfLoader.getText(pdfDocument)
            Finally
                ' Close the document even when text extraction throws, so it is not leaked.
                pdfDocument.close()
            End Try
            Me.FromPdfToHtml(pdfOnString)
        End If
    End Sub

    ''' <summary>
    ''' Writes the given HTML text to the file whose path is typed in TextBox2.
    ''' </summary>
    ''' <param name="inputHtml">The HTML content to write.</param>
    ''' <returns>
    ''' True when the content was written without an exception; False when an
    ''' exception occurred (its details are shown in a message box).
    ''' </returns>
    Private Function WriteHtmlToFile(ByVal inputHtml As String) As Boolean
        Dim writeProcess As Boolean = False
        Try
            ' Using disposes the writer, releasing the file, even when Write throws.
            Using writer As New StreamWriter(Me.TextBox2.Text)
                writer.Write(inputHtml)
            End Using
            writeProcess = True
        Catch ex As Exception
            ' "&" treats Nothing as an empty string, so a missing Source or StackTrace
            ' cannot raise a second exception while the first one is being reported.
            MessageBox.Show(ex.Message & ex.Source & ex.StackTrace)
        End Try
        Return writeProcess
    End Function

    ''' <summary>
    ''' Wraps the text extracted from a PDF in a minimal HTML page, styled with the
    ''' font held in currFont, writes the page through WriteHtmlToFile and reports
    ''' the outcome in a message box.
    ''' </summary>
    ''' <param name="pdfString">Plain text extracted from the PDF; Nothing is treated as empty text.</param>
    ''' <returns>True when WriteHtmlToFile reported success, False otherwise.</returns>
    Private Function FromPdfToHtml(ByVal pdfString As String) As Boolean
        ' NOTE(review): the opening and closing markup of the published listing was lost
        ' (the first literal was unterminated and the last one was empty), so the tags
        ' below are a reconstruction - confirm against the intended page skeleton.
        ' The charset matches StreamWriter's default encoding used by WriteHtmlToFile.
        Dim html As String = "<html><head><meta http-equiv=""Content-Type"" content=""text/html; charset=utf-8""></head><body>"

        If Me.currFont Is Nothing Then
            ' TextBox3 can be typed into without using the font dialog, which leaves
            ' currFont unset: emit an unstyled container instead of failing.
            html = html & "<div>"
        Else
            ' Format the size with the invariant culture: cultures that use a decimal
            ' comma would otherwise produce e.g. "8,25pt", which is not valid CSS.
            Dim fontSize As String = Me.currFont.SizeInPoints.ToString(System.Globalization.CultureInfo.InvariantCulture)
            ' The family name is quoted because names may contain spaces.
            html = html & "<div style=""font-family:'" & Me.currFont.FontFamily.Name & "'"
            html = html & ";font-size:" & fontSize & "pt"">"
        End If

        ' Escape the characters that are significant in HTML so that the extracted
        ' text is displayed literally instead of being parsed as markup.
        Dim safeText As String = String.Empty
        If pdfString IsNot Nothing Then
            safeText = pdfString.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;")
        End If
        html = html & safeText
        html = html & "</div></body></html>"

        ' Report the real outcome of the write instead of always returning False.
        Dim result As Boolean = Me.WriteHtmlToFile(html)
        Dim str As String
        If result Then
            str = "File scritto"
        Else
            str = "File non scritto"
        End If
        MessageBox.Show(str)
        Return result
    End Function

    ''' <summary>
    ''' Tells whether a control carries a usable text value.
    ''' </summary>
    ''' <param name="inputControl">The control to inspect; Nothing yields False.</param>
    ''' <returns>True when the control's Text is neither Nothing nor empty.</returns>
    Private Function ControlValidText(ByVal inputControl As Control) As Boolean
        Return inputControl IsNot Nothing AndAlso Not String.IsNullOrEmpty(inputControl.Text)
    End Function
End Class
Friday, April 16, 2010
Subscribe to:
Post Comments (Atom)

No comments:
Post a Comment