#! /usr/bin/python2.7
# This script is written for Python 2. If the plain "python" command on your system is already Python 2, you can remove the "2.7" from the end of the previous line.

#This script requires unzip, antiword, aft (almost free text), pdfimages (in poppler-utils), imagemagick, 
#and last but not least cat_open_xml.pl by Kristinn Gudjonsson.
#The last file should be located at /usr/bin, chmod'ed 755, without the .pl extension for this script to work.

#Importing stuff
import sys
import os
import codecs
from subprocess import Popen, PIPE

arg = sys.argv #This list contains the input arguments to this script. Entry no. 0 is the name of this script.

#Print the usage text and stop if no file was given or help (-h/--help) was requested.
#sys.exit is used rather than the bare exit(): the latter is only injected by the
#site module and is missing when Python is started with -S or from a frozen build.
if len(arg) < 2 or arg[1] in ('-h', '--help'):
	print('Usage: whim myDoc.doc\nInput a *.doc or *.docx file, and the script will create a *.tex file which contains the text of the file.')
	print('Symbols will be converted and most formatting will survive. Images will be inserted at the end of the .tex file.')
	sys.exit()

#Global variables from the rc file
editor = ''
#HOME can be unset (cron, stripped environments); expanduser('~') then falls back
#to the password database instead of crashing on None + '/.whimrc'.
home = os.getenv('HOME') or os.path.expanduser('~')
rc_path = os.path.join(home, '.whimrc')
#We read ~/.whimrc. If that fails (the file does not exist) or it holds no usable
#"editor=" entry, we assume this is the first time whim is run and ask the user
#the questions needed to set up a .whimrc file.
try:
	with open(rc_path) as dot: #Read mode; the with-block closes the file even on error.
		params = dot.read()
	#We handle the config params:
	editor_pos = params.find("editor=")
	if editor_pos != -1:
		#Everything after "editor=" up to the end of that line. strip() drops a stray
		#carriage return or padding that would otherwise end up in the command line.
		editor = params[editor_pos + len("editor="):].split("\n")[0].strip()
	if not editor:
		raise IOError('NoEditor') #No (or an empty) editor entry: create a new rc-file.
except IOError:
	print("Welcome to Whim!\nYour .whimrc file was not found, assuming this is your first run, we will create one.\n")
	editor = raw_input("What is your favourite editor?")
	#The file is opened only after the question has been answered, so an aborted
	#prompt does not leave a truncated .whimrc behind. Write mode creates the file
	#if it does not exist and overwrites it if it does.
	with open(rc_path, 'w') as newdot:
		newdot.write("editor=" + editor + "\n")
		#More configuration can be handled just like the previous line.
	#Note that the variable is set in the script as well as saved to .whimrc, so the first run works too.


#Convenient names.
#doc / docx hold the index in arg[1] at which the ".doc" / ".docx" extension starts,
#or -1 when the input does not have that extension. At most one of them is set.
doc = -1
docx = -1
#Only a trailing extension counts (case-insensitively). Searching the whole path
#would accept names such as "notes.docs.txt" and cut the base name mid-path.
if arg[1][-5:].lower() == '.docx':
	docx = len(arg[1]) - 5
elif arg[1][-4:].lower() == '.doc':
	doc = len(arg[1]) - 4

#Check if input is valid
if doc == -1 and docx == -1:
	print('Input must be a .doc or .docx document!')
	sys.exit(1) #Non-zero status so calling scripts can detect the failure.
#These are filled in by whichever conversion branch runs below.
name = ''
longname = ''

#If docx, use the docx tool
if docx != -1:
	#Get name: the input path without its extension.
	name = arg[1][:docx]
	#Get text file.
	#With shell=True and a list, Popen runs `sh -c list[0] list[1] list[2] ...`:
	#list[1] becomes $0 and the following items $1, $2, ... so the file name reaches
	#the tool as data and is never parsed as shell syntax.
	cmd = '/usr/bin/cat_open_xml "$1" > "$PWD/temp.aft"'
	p = Popen([cmd, 'whim', arg[1]], shell=True)
	p.wait()# Waaait for it!
	#cat_open_xml appends the line 'returning from a call...', so we remove it.
	cmd = 'sed "/returning from a call../d" $PWD/temp.aft > $PWD/temp2.aft; mv $PWD/temp2.aft $PWD/temp.aft'
	p = Popen(cmd, shell=True)
	p.wait()
	#Get images, if any.
	#longname is the suffix of the scratch directory "temp<longname>" in the current
	#directory. Only the base name is used, so an input given with a directory part
	#does not turn into a nested (non-existent) path.
	base = os.path.basename(name).replace(' ', '')
	longname = base
	attempt = 0
	while os.path.exists('temp' + longname): #Pick a directory name that is not taken yet.
		attempt += 1
		longname = base + str(attempt)
	os.mkdir('temp' + longname)
	#The wildcard is quoted so that unzip, not the shell, matches it against the archive.
	cmd = '/usr/bin/unzip -j "$1" "word/media/*" -d "$PWD/temp$2"'
	p = Popen([cmd, 'whim', arg[1], longname], shell=True, stdout=PIPE)
	p.communicate() #Drains stdout and waits; the listing itself is not needed.
	#Whatever was extracted is now in the scratch directory. .emz files are left out
	#of the list, as before, and are deleted together with the scratch directory.
	files = [entry for entry in sorted(os.listdir('temp' + longname)) if '.emz' not in entry]
	#We have extracted the images, and removed garbage from our list of files

#Else use antiword
if doc != -1:
	#Get name: the input path without its extension.
	name = arg[1][:doc]
	#Get text file.
	#With shell=True and a list, Popen runs `sh -c list[0] list[1] list[2] ...`:
	#list[1] becomes $0 and the following items $1, $2, ... so the file name reaches
	#the tool as data and is never parsed as shell syntax.
	cmd = '/usr/bin/antiword "$1" > "$PWD/temp.aft"'
	p = Popen([cmd, 'whim', arg[1]], shell=True)
	p.wait()
	#Get images, if any: antiword renders the document to PDF, pdfimages pulls the images out of it.
	cmd = '/usr/bin/antiword -m 8859-15.txt -a a4 -i 0 "$1" > "$PWD/temp.aft.pdf"'
	p = Popen([cmd, 'whim', arg[1]], shell=True)
	p.wait()
	#longname is the suffix of the scratch directory "temp<longname>" in the current
	#directory. Only the base name is used, so an input given with a directory part
	#does not turn into a nested (non-existent) path.
	base = os.path.basename(name).replace(' ', '')
	longname = base
	attempt = 0
	while os.path.exists('temp' + longname): #Pick a directory name that is not taken yet.
		attempt += 1
		longname = base + str(attempt)
	os.mkdir('temp' + longname)
	#The trailing slash makes pdfimages write its files inside the scratch directory.
	#The PDF is removed only after pdfimages has read it.
	cmd = 'pdfimages -j "$PWD/temp.aft.pdf" "$PWD/temp$1/"; rm "$PWD/temp.aft.pdf"'
	p = Popen([cmd, 'whim', longname], shell=True)
	p.wait()
	#The list is empty when the document contained no images.
	files = sorted(os.listdir('temp' + longname))
#We have extracted images from doc or docx, they are in the temp folder. We move the images in 'files' to the current directory.
#os.rename is used instead of a shell "mv": the image names come from inside the
#document and must not be interpreted as shell syntax.
for item in files:
	if not item: #Skip empty entries (e.g. from a trailing newline in a listing).
		continue
	try:
		os.rename(os.path.join('temp' + longname, item), item)
	except OSError as err:
		#Best effort, as before: report the problem and carry on with the next image.
		sys.stderr.write('Could not move image ' + item + ': ' + str(err) + '\n')

#If we have a .doc, images can be stored in .ppm format, which is utterly useless. We convert them to png:
if doc != -1:
	for i, image in enumerate(files):
		if not image.endswith('.ppm'):
			continue
		png = image[:-len('.ppm')] + '.png' #Same name, .ppm replaced by .png
		print('Converting ' + image + ' to ' + png)
		#With shell=True and a list, Popen runs `sh -c list[0] list[1] ...`, so the
		#names arrive as $1/$2 and are never parsed as shell syntax. The "$PWD/" prefix
		#also keeps names starting with "-" from being read as options.
		#The .ppm is removed only if convert succeeded.
		cmd = 'convert "$PWD/$1" "$PWD/$2" && rm "$PWD/$1"'
		p = Popen([cmd, 'whim', image, png], shell=True)
		if p.wait() == 0:
			#We also update the name in our list.
			files[i] = png

#Send the output from doc[x] into aft, which creates the tex-file.
#With shell=True and a list, Popen runs `sh -c list[0] list[1] list[2] ...`: list[1]
#becomes $0 and list[2] becomes $1, so the output name is passed as data and quotes
#or other special characters in it cannot alter the command.
cmd = 'aft --output="$1" --type=tex temp.aft'
p = Popen([cmd, 'whim', name + '.tex'], shell=True)
p.wait()

#Remove the temp-file and the temp directory.
#The directory suffix is handed to the shell as positional parameter $1 (with
#shell=True and a list, Popen runs `sh -c list[0] list[1] list[2] ...`) and the path
#is quoted, so `rm -rf` only ever sees the one intended path.
cmd = 'rm "$PWD/temp.aft"; rm -rf "$PWD/temp$1"'
p = Popen([cmd, 'whim', longname], shell=True)
p.wait()

#Edit the .tex-file to include the images (if any). They are placed at the bottom of the document,
#immediately before each line containing "end{document}", in the order of 'files'.
#We use \includegraphics in order to do this, AFT includes the graphicx package for us.
#This is done in Python rather than with sed: the sed one-liner lost both
#backslashes (producing plain text instead of a LaTeX command) and treated the
#image names as part of the sed script.
graphics = [r'\includegraphics[width=0.8\linewidth]{' + item + '}\n' for item in files if item != '']
if graphics:
	tex_path = name + '.tex' #The file aft was asked to write.
	try:
		with open(tex_path) as tex:
			tex_lines = tex.readlines()
	except IOError as err:
		#Best effort: without a .tex file there is nothing to edit.
		sys.stderr.write('Could not read ' + tex_path + ': ' + str(err) + '\n')
	else:
		updated = []
		for line in tex_lines:
			if 'end{document}' in line:
				updated.extend(graphics)
			updated.append(line)
		with open(tex_path, 'w') as tex:
			tex.writelines(updated)
#We do not bother to put graphics into figure environment, as the user will need to edit this to his likings anyway.

#Fire up vim, ehm., text editor of choice, for the user to edit the file.
#abspath gives the same "$PWD/name.tex" as before for relative input and stays
#correct when the input was given as an absolute path.
tex_path = os.path.abspath(name + '.tex')
if editor.strip():
	#The editor setting is still run through the shell, so it may contain options
	#(e.g. "emacs -nw"). The file name is passed as positional parameter $1: with
	#shell=True and a list, Popen runs `sh -c list[0] list[1] list[2] ...`, so the
	#name is never parsed as shell syntax.
	cmd = editor + ' "$1"'
	p = Popen([cmd, 'whim', tex_path], shell=True)
	p.wait()
else:
	#Without an editor the shell would try to execute the .tex file itself.
	print('No editor configured; the result is in ' + tex_path)



