/**
 * Extracts a configurable set of meta data properties (subject, sender, recipients, timestamps)
 * from the raw bytes of an Outlook MSG file.
 *
 * Each supported template property name is mapped to the name of a no-argument getter on
 * OutlookMessage; the getter is looked up and invoked via reflection.
 *
 * NOTE(review): this listing is abridged ("..." marks elided code) and LOG is used below without
 * a visible declaration -- presumably a logger field that was left out of the snippet; confirm.
 */
public class MsgExtractor
{
/** Template property names accepted in the constructor's properties array. */
public static final String PROPERTY_SUBJECT = "Subject";
public static final String PROPERTY_FROM = "From";
public static final String PROPERTY_TO = "To";
public static final String PROPERTY_CC = "Cc";
public static final String PROPERTY_SENT = "Sent";
public static final String PROPERTY_RECEIVED = "Received";
/** Maps a template property name (String) to the name of the OutlookMessage getter (String). */
private static final Map TEMPLATE_PROPERTY_2_GET_METHODS;
static {
TEMPLATE_PROPERTY_2_GET_METHODS = new HashMap();
TEMPLATE_PROPERTY_2_GET_METHODS.put(PROPERTY_SUBJECT, "getSubject");
TEMPLATE_PROPERTY_2_GET_METHODS.put(PROPERTY_FROM, "getDisplayFrom");
TEMPLATE_PROPERTY_2_GET_METHODS.put(PROPERTY_TO, "getDisplayTo");
TEMPLATE_PROPERTY_2_GET_METHODS.put(PROPERTY_CC, "getDisplayCc");
TEMPLATE_PROPERTY_2_GET_METHODS.put(PROPERTY_SENT, "getClientSubmitTime");
TEMPLATE_PROPERTY_2_GET_METHODS.put(PROPERTY_RECEIVED, "getMessageDeliveryTime");
}
/** Names of the properties to extract; never null (see constructor). */
final String[] properties;
/**
 * Creates an extractor for the given property names.
 *
 * @param properties names of the properties to extract; null is treated as an empty array
 */
public MsgExtractor(final String[] properties)
{
this.properties = (properties == null ? new String[] {} : properties);
}
/**
 * Parses the given MSG file content and returns the values of the configured properties.
 *
 * @param data raw bytes of the MSG file (the null check mentioned below is elided from this listing)
 * @return Collections.EMPTY_MAP if no properties are configured; otherwise a new map from each
 *         configured property name to the value returned by the matching OutlookMessage getter,
 *         or to null when no getter is registered for that name
 * @throws MetaDataExtractionException declared by the signature; the throwing code is elided
 *         from this listing
 */
public Map parseMetaData(final byte[] data) throws MetaDataExtractionException
{
// check at first whether byte array data is null and throw an exception if it's null
...
// nothing requested ==> nothing to parse
if (properties.length == 0) {
return Collections.EMPTY_MAP;
}
Map metaData = new HashMap();
OutlookMessage outlookMessage = new OutlookMessage(new ByteArrayInputStream(data));
try {
for (int i = 0; i < properties.length; i++) {
Object propertyValue = null;
// resolve the getter name registered for this template property
String strMethod = (String) TEMPLATE_PROPERTY_2_GET_METHODS.get(properties[i]);
if (strMethod != null) {
// look up the no-argument getter by name and invoke it on the parsed message
Method method = OutlookMessage.class.getMethod(strMethod, (Class[]) null);
propertyValue = method.invoke(outlookMessage, (Object[]) (Object[]) null);
} else {
// unknown property name: warn and fall through so the name is still stored with a null value
LOG.warn("No Get-Method to the MAPI-Property was found for the defined template property '"
+ properties[i] + "'");
}
// buffer property values
metaData.put(properties[i], propertyValue);
}
} catch (final Exception e) {
// failed to extract meta data ==> error handling
...
}
return metaData;
}
}
The main class is OutlookMessage, which hides the magic. It looks as follows:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.util.Iterator;
import org.apache.commons.io.EndianUtils;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
**
* Reads an Outlook MSG File in and provides hooks into its data structure. Some hints to the structure were found under
*
* <ul>
* <li>http://www.fileformat.info/format/outlookmsg/</li>
* <li>http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/</li>
* <li>http://www.tech-archive.net/Archive/Development/microsoft.public.win32.programmer.ole/2006-08/msg00123.html
* </li>
* </ul>
*
*/
public class OutlookMessage
{
private static final Log LOG = LogFactory.getLog(OutlookMessage.class);
/** id for the mapi property "subject" */
private static final int PROP_SUBJECT = 0x0037;
/** id for the mapi property "display from" */
private static final int PROP_DISPLAY_FROM = 0x0C1A;
/** id for the mapi property "display to" */
private static final int PROP_DISPLAY_TO = 0x0E04;
/** id for the mapi property "display cc" */
private static final int PROP_DISPLAY_CC = 0x0E03;
/** reversed endian format of the property id "client submit time" */
private static final byte[] PROP_CLIENT_SUBMIT_TIME = {0x40, 0x00, 0x39, 0x00};
/** reversed endian format of the property id "message delivery time" */
private static final byte[] PROP_MESSAGE_DELIVERY_TIME = {0x40, 0x00, 0x06, 0x0E};
private String subject;
private String displayFrom;
private String displayTo;
private String displayCc;
private Date clientSubmitTime;
private Date messageDeliveryTime;
public OutlookMessage(String filename) throws IOException
{
this(new FileInputStream(new File(filename)));
}
public OutlookMessage(InputStream in)
{
try {
POIFSFileSystem fs = new POIFSFileSystem(in);
initMapiProperties(fs);
} catch (IOException ioe) {
LOG.warn("Some properties could be not parsed from given MSG message " + ioe);
} finally {
if (in != null) {
try {
in.close();
} catch (IOException e) {
;
}
}
}
}
private void initMapiProperties(POIFSFileSystem fs) throws IOException
{
DirectoryEntry root = fs.getRoot();
for (Iterator iter = root.getEntries(); iter.hasNext();) {
Entry entry = (Entry) iter.next();
if (!(entry instanceof DocumentEntry)) {
continue;
}
String entryName = entry.getName();
if (entryName == null) {
continue;
}
// parse MAPI properties
if (entryName.startsWith("__substg1.0_")) {
byte[] substgBytes = getBytes((DocumentEntry) entry);
int id = Integer.parseInt(entryName.substring(12, 16), 16);
int type = Integer.parseInt(entryName.substring(16, 20), 16);
if (id == PROP_SUBJECT) {
// subject
this.subject = getString(substgBytes, isUnicodeString(type));
} else if (id == PROP_DISPLAY_FROM) {
// display from
this.displayFrom = getString(substgBytes, isUnicodeString(type));
} else if (id == PROP_DISPLAY_TO) {
// display to
this.displayTo = getString(substgBytes, isUnicodeString(type));
} else if (id == PROP_DISPLAY_CC) {
// display cc
this.displayCc = getString(substgBytes, isUnicodeString(type));
}
} else if (entryName.startsWith("__properties_version1.0")) {
byte[] propBytes = getBytes((DocumentEntry) entry);
int offset = 0;
int bytesLength = propBytes.length;
while (offset + 16 <= bytesLength) {
byte[] propId = read4(propBytes, offset);
if (compare(propId, PROP_CLIENT_SUBMIT_TIME)) {
// read value
this.clientSubmitTime = getDate(propBytes, offset);
} else if (compare(propId, PROP_MESSAGE_DELIVERY_TIME)) {
// read value
this.messageDeliveryTime = getDate(propBytes, offset);
}
offset = offset + 16;
}
}
}
}
private byte[] getBytes(DocumentEntry docEntry) throws IOException
{
DocumentInputStream dis = new DocumentInputStream(docEntry);
byte[] propBytes = new byte[dis.available()];
try {
byte[] bytes = new byte[4096];
int readCount;
int curPosition = 0;
while ((readCount = dis.read(bytes)) > -1) {
System.arraycopy(bytes, 0, propBytes, curPosition, readCount);
curPosition = curPosition + readCount;
}
} finally {
dis.close();
}
return propBytes;
}
private String getString(byte[] bytes, boolean isUnicode)
{
if (ArrayUtils.isEmpty(bytes)) {
return null;
}
try {
String str;
if (isUnicode) {
str = new String(bytes, 0, bytes.length, "UTF-16LE");
} else {
str = new String(bytes, 0, bytes.length, "ISO8859_1");
}
int len = str.length();
while (len > 0 && str.charAt(len - 1) == '\0') {
len--;
}
if (len != str.length()) {
str = str.substring(0, len);
}
if (StringUtils.isBlank(str)) {
str = null;
}
return str;
} catch (UnsupportedEncodingException ignore) {
;
}
return null;
}
private boolean isUnicodeString(int type)
{
return (type == 0x001F ? true : false); // for not unicode string type = 0x001E
}
private byte[] read4(byte[] data, int offset)
{
byte[] readBytes = new byte[4];
System.arraycopy(data, offset, readBytes, 0, 4);
return readBytes;
}
private byte[] read8(byte[] data, int offset)
{
byte[] readBytes = new byte[8];
System.arraycopy(data, offset, readBytes, 0, 8);
return readBytes;
}
private boolean compare(byte[] b1, byte[] b2)
{
for (int i = 0; i < b1.length; ++i) {
if (b1[i] != b2[i]) {
return false;
}
}
return true;
}
private Date getDate(byte[] propBytes, int offset)
{
// read value
byte[] value = read8(propBytes, offset + 8);
// convert to long (reverse Endian format)
long time = EndianUtils.readSwappedLong(value, 0);
// FILETIME 64-bit int number of 100ns periods since Jan 1, 1601 ==>
// convert ns to ms and substruct milliseconds between 1/1/1601 and 1/1/1970
time = (time / 10 / 1000) - 1000L * 60L * 60L * 24L * (365L * 369L + 89L);
return new Date(time);
}
// getter
public String getSubject() {return subject;}
public String getDisplayFrom() {return displayFrom;}
public String getDisplayTo() {return displayTo;}
public String getDisplayCc() {return displayCc;}
public Date getClientSubmitTime() {return clientSubmitTime;}
public Date getMessageDeliveryTime() {return messageDeliveryTime;}
}
Parsing the MAPI properties stored in the __substg1.0_ streams doesn't cause any problems; they are ordinary strings. But some properties live in a binary stream called __properties_version1.0. This is the most interesting stream. Hint to the structure of __properties_version1.0: Each MAPI property has a well-documented hex ID, available on the MSDN site. For example, the property PR_CLIENT_SUBMIT_TIME has an identifier of 0x00390040. If you open the properties stream and work with 16-byte rows, divided into two 8-byte sections (just like a hex editor), you will see that the property identifier is in the first 8-byte section and the value is in the second. The trick is to reverse the byte order (endianness) of the property ID, so the ID for the PR_CLIENT_SUBMIT_TIME property becomes 40 00 39 00. Locate this hex block and you will have the value (a FILETIME, likewise stored in little-endian byte order, which is why the code reads it with EndianUtils.readSwappedLong) in the next 8-byte section.
With a little code extension you can extract "creation time" (reversed endian format of the property id {0x40, 0x00, 0x07, 0x30}), "last modification time" (reversed endian format of the property id {0x40, 0x00, 0x08, 0x30}) or all other imaginable properties. You can use a Viewer for MS Outlook Messages to find out proper endian formats. I have tried this approach for MSG files produced with MS Outlook 2003 / 2007, for unicode / non unicode messages and any other combinations. It was working in all cases.
No comments:
Post a Comment
Note: Only a member of this blog may post a comment.