Trying to get cleaner XML output from a text file

I

iainemsley

I'm using Python2.5 to try and convert some text files into XML using
xml.minidom. I'm currently doing some plays which have a structure
like
Scene 1
Act 1
blah blah
Act2
blah blah
Scene 2
Act 1
and so on.
I'm trying to turn it into
<div type="scene">1
<div type="act">1
<speech />
</div>
<div type="act">2
<speech />
</div>
</div>
(or ideally <div type="scene" id="1"> bit I can always come back to
this later)
I've currently got:
<div id="" type="scene">
<div id=" " type="act">
<speech>
II
</speech>
</div>
</div>
<div id="" type="scene">
<div id=" " type="act">
<speech>
II
</speech>
</div>
</div>
<div id="" type="scene">
<div id=" " type="act">
The code I'm currently working with is:
from itertools import groupby
from xml.dom.minidom import Document

import re

text = open('\\texts\\midsummer_nights_dream_gut.txt').read()

def paragraphs(lines, is_separator=str.isspace, joiner=''.join):
for separator_group, lineiter in groupby(lines, key=is_separator):
if not separator_group:
yield joiner(lineiter)

def scene_node(scene):
global docText
docText = doc.createElement("div")
#need to set the type to book, verse, drama
docText.setAttribute("type", "scene")
#need set the id to what ever break name or id: i.e. chapter 1 or
act 1
docText.setAttribute("id", '')
tei.appendChild(docText)
for acts in actTxt.split(scene):
act_node(acts)

def act_node(act):
global actText
actText = doc.createElement("div")
#need to set the type to book, verse, drama
actText.setAttribute("type", "act")
#need set the id to what ever id: 1 or I
actText.setAttribute("id", ' ')
docText.appendChild(actText)
for p in paragraphs(act.splitlines(True)):
speech_node(p)

def speech_node(speech):
para = doc.createElement("speech")
actText.appendChild(para)
ptext = doc.createTextNode(speech)
para.appendChild(ptext)


doc = Document()
tei = doc.createElement("body")
doc.appendChild(tei)

sideTxt = re.compile(r"Scene\s+([1-9])", re.I)
actTxt = re.compile(r"Act\s+([1-9])", re.I)
for textStr in sideTxt.split(text):
scene_node(textStr)

print doc.toprettyxml(indent = " ")
I'd be grateful for some pointers about getting a cleaner output.

Thanks,

Iain
 
G

Gabriel Genellina

I'm using Python2.5 to try and convert some text files into XML using
xml.minidom. I'm currently doing some plays which have a structure
like
Scene 1
Act 1
blah blah
Act2
blah blah
Scene 2
Act 1
and so on.

(I think you get the hierarchy wrong: usually a play contains some Acts;
each act contains several Scenes)
I'm trying to turn it into
<div type="scene">1
<div type="act">1
<speech />
</div>
<div type="act">2
<speech />
</div>
</div>
(or ideally <div type="scene" id="1"> bit I can always come back to
this later)

Using ElementTree is a lot easier than minidom:

import sys
from itertools import groupby, count
import xml.etree.ElementTree as ET
import re

class Seq:
"Automatic sequencer for acts/scenes"
def __init__(self):
self.act_nr = count(1)
self.scene_nr = count(1)
def next_scene(self):
return self.scene_nr.next()
def next_act(self):
self.scene_nr = count(1)
return self.act_nr.next()
seq = Seq()

def add_act(body, act_text):
act = ET.SubElement(body, "div", type="act", id="a%s" % seq.next_act())
for scene_text in scene_sep.split(act_text):
add_scene(act, scene_text)

def add_scene(act, scene_text):
scene = ET.SubElement(act, "div", type="scene", id="%ss%s" %
(act.get("id"), seq.next_scene()))
for p in paragraphs(scene_text.splitlines(True)):
add_speech(scene, p)

def add_speech(scene, p):
speech = ET.SubElement(scene, "speech")
speech.text = p

body = ET.Element("body")
scene_sep = re.compile(r"Scene\s+[1-9]+", re.I)
act_sep = re.compile(r"Act\s+[1-9]+", re.I)
for act_text in act_sep.split(text):
add_act(body, act_text)

doc = ET.ElementTree(body)
doc.write(sys.stdout)
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Similar Threads


Members online

No members online now.

Forum statistics

Threads
473,769
Messages
2,569,579
Members
45,053
Latest member
BrodieSola

Latest Threads

Top