forked from MITx/ocw2edx
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxbundle.py
executable file
·641 lines (541 loc) · 20.8 KB
/
xbundle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
#!/usr/bin/python
#
# XBundle class
#
# an xbundle file is an XML format file with element <xbundle>, which
# includes the following sub-elements:
#
# <metadata>
# <policies semester=...>: <policy> and <gradingpolicy>
# each contain the JSON for the corresponding file
# <about>: <file filename=...> </about>
# </metadata>
# <course semester="...">: course XML </course>
#
# The XBundle class represents an xbundle file; it can read and write
# the file, and it can import and export to standard edX (unbundled) format.
import os
import sys
import re
import string
import glob
from lxml import etree
from lxml.html.soupparser import fromstring as fsbs
from path import path # needs path.py
#-----------------------------------------------------------------------------
# Sample course policy document (JSON text), keyed by "course/<semester>".
# It carries display/start settings, the tab list, a remote gradebook entry
# and discussion topics.  Nothing else in this file references the constant.
DEF_POLICY_JSON = """
{
"course/2013_Spring": {
"graceperiod": "1 day 5 hours 59 minutes 59 seconds",
"start": "2013-02-19T14:15",
"display_name": "The Challenges of Existence",
"showanswer": "attempted",
"rerandomize": "never",
"show_calculator": "Yes",
"tabs": [ {"type": "courseware"},
{"name": "Course Info", "type": "course_info"},
{"name": "Discussion", "type": "discussion"},
{"name": "Progress", "type": "progress"},
{"name": "Staff Grading", "type": "staff_grading"}
],
"remote_gradebook": {
"name" : "STELLAR:/project/mitxdemosite",
"section" : "r01"
},
"discussion_topics": {
"General": {
"sort_key": "A",
"id": "4201x_Spring2013_General"
},
"Feedback": {
"sort_key": "AA",
"id": "4201x_Spring2013_Feedback"
},
"Troubleshooting": {
"sort_key": "AB",
"id": "4201x_Spring2013_Troubleshooting"
}
}
}
}
"""
# Sample grading policy document (JSON text): a "GRADER" list of weighted
# assignment types (the weights listed sum to 1.0) and "GRADE_CUTOFFS"
# thresholds for A/B/C.  Nothing else in this file references the constant.
DEF_GRADING_POLICY_JSON = """
{
"GRADER" : [
{
"type" : "Homework",
"min_count" : 12,
"drop_count" : 2,
"short_label" : "HW",
"weight" : 0.15
},
{
"type" : "Lab",
"min_count" : 12,
"drop_count" : 2,
"category" : "Labs",
"weight" : 0.15
},
{
"type" : "Midterm",
"name" : "Midterm Exam",
"short_label" : "Midterm",
"weight" : 0.3
},
{
"type" : "Final",
"name" : "Final Exam",
"short_label" : "Final",
"weight" : 0.4
}
],
"GRADE_CUTOFFS" : {
"A" : 0.87,
"B" : 0.7,
"C" : 0.6
}
}
"""
#-----------------------------------------------------------------------------
class XBundle(object):
    '''
    An XBundle is defined by two elements: course and metadata.
    metadata includes policies and about.

    Instances can be built piece by piece (set_course, add_policies,
    add_about_file), loaded from / saved to a single xbundle XML file, or
    imported from / exported to an edX XML directory tree.
    '''
    # Tags that edX stores as separate "descriptor" files referenced by url_name.
    DescriptorTags = ['course', 'chapter', 'sequential', 'vertical', 'html', 'problem', 'video',
                      'conditional', 'combinedopenended']
    MapTags = dict(section='sequential')
    DefaultSemester = '2013_Fall'
    DefaultOrg = 'MITx'
    # xbundle policy tag -> base name of the JSON file in the policies directory
    PolicyTagMap = {'policy': 'policy', 'gradingpolicy': 'grading_policy'}
    html_parser = etree.HTMLParser(compact=False, recover=True, remove_blank_text=True)

    def __init__(self, keep_urls=False, force_studio_format=False):
        '''
        if keep_urls=True then the original url_name attributes are kept upon import and export,
        if nonrandom (ie non-Studio).

        if force_studio_format=True then, on export, every child of a
        <sequential> that is not a <vertical> is wrapped in a <vertical>.
        '''
        self.course = etree.Element('course')
        self.metadata = etree.Element('metadata')
        self.urlnames = []      # url_names handed out so far by make_urlname
        self.xml = None         # only used if XML xbundle file was read in
        self.keep_urls = keep_urls
        self.force_studio_format = force_studio_format  # sequential must be followed by vertical in export

    #----------------------------------------
    # creation by parts

    def set_course(self, xml):
        '''
        Use xml (a <course> element) as the course tree.

        Missing org / semester attributes are filled in from DefaultOrg and
        DefaultSemester.  Anything other than a <course> element is rejected
        with a logged message and the current course is left untouched.
        '''
        if xml.tag != 'course':
            self.errlog('set_course should be called with a <course> element')
            return
        if 'org' not in xml.attrib:
            xml.set('org', self.DefaultOrg)
        if 'semester' not in xml.attrib:
            xml.set('semester', self.DefaultSemester)
        self.course = xml

    def add_policies(self, policies):
        '''add a policies XML subtree to the metadata'''
        self.metadata.append(policies)

    def set_about(self, about):
        '''set about XML tree, replacing any <about> already in the metadata'''
        existing = self.metadata.find('about')
        if existing is not None:
            self.metadata.remove(existing)
        self.metadata.append(about)

    def add_about_file(self, filename, filedata):
        '''add a file to the about element (creating <about> if needed)'''
        about = self.metadata.find('about')
        if about is None:
            about = etree.SubElement(self.metadata, 'about')
        abfile = etree.SubElement(about, 'file')
        abfile.set('filename', filename)
        abfile.text = filedata

    #----------------------------------------
    # load/save

    def load(self, fn):
        """
        Load from xbundle.xml file

        fn is passed straight to etree.parse.  The <course> and <metadata>
        children of the root become self.course / self.metadata; a missing
        <metadata> is replaced by an empty element so later lookups work.
        """
        self.xml = etree.parse(fn).getroot()
        self.course = self.xml.find('course')
        self.metadata = self.xml.find('metadata')
        if self.metadata is None:
            self.metadata = etree.Element('metadata')
        self.errlog("course id = %s" % self.course_id())

    def save(self, fn='xbundle.xml', fp=None):
        """
        Save to xbundle.xml file

        If fp (an open, writable file object) is given it is written to and
        left open for the caller; otherwise fn is opened, written and closed.
        """
        data = str(self)
        if fp is not None:
            fp.write(data)
            return
        with open(fn, 'w') as out:
            out.write(data)

    def __str__(self):
        '''
        Return the pretty-printed <xbundle> document.

        Side effect: self.metadata and self.course are re-parented under a
        fresh <xbundle> root, which is also stored in self.xml.
        '''
        xml = etree.Element('xbundle')
        self.xml = xml
        xml.append(self.metadata)
        xml.append(self.course)
        return self.pp_xml(xml)

    #----------------------------------------
    # import/export

    def import_from_directory(self, dir='./'):
        '''
        Create xbundle from edX xml directory.
        Using this is a great way to sanitize directory structure
        and also normalize url_name filenames (and make them
        meaningfully human readable).
        '''
        dir = path(dir)
        self.metadata = etree.Element('metadata')
        self.import_metadata_from_directory(dir)
        self.import_course_from_directory(dir)

    def import_metadata_from_directory(self, dir):
        '''
        Load policies/<semester>/*.json and about/* from dir into the metadata.

        Each policy file becomes a child of a <policies semester=...> element,
        tagged with the file name minus underscores and the .json suffix.
        Directory listings are sorted so the result does not depend on
        filesystem order.  About files that cannot be added are logged and
        skipped.
        '''
        dir = path(dir)
        # load policies
        for pdir in sorted(glob.glob(dir / 'policies/*')):
            policies = etree.Element('policies')
            policies.set('semester', os.path.basename(pdir))
            for fn in sorted(glob.glob(path(pdir) / '*.json')):
                tag = os.path.basename(fn).replace('_', '').replace('.json', '')
                x = etree.SubElement(policies, tag)
                with open(fn) as fp:
                    x.text = fp.read()
            self.add_policies(policies)

        # load about files
        for afn in sorted(glob.glob(dir / 'about/*')):
            try:
                with open(afn) as fp:
                    self.add_about_file(os.path.basename(afn), fp.read())
            except Exception as err:
                # best effort: one unreadable about file should not abort the import
                self.errlog("Oops, failed to add file %s, error=%s" % (afn, err))

    def import_course_from_directory(self, dir):
        '''load course tree, removing intermediate descriptors with url_name'''
        dir = path(dir)
        x = etree.parse(dir / 'course.xml').getroot()
        semester = x.get('url_name', '')  # the url_name of <course> is special - the semester
        cxml = self.import_xml_removing_descriptor(dir, x)
        cxml.set('semester', semester)
        self.course = cxml
        self.fix_old_course_section()
        self.fix_old_descriptor_name(self.course)

    def fix_old_descriptor_name(self, xml):
        '''
        Turn name -> display_name on descriptor tags

        Applied recursively.  The name attribute is always dropped; its value
        is copied to display_name only when display_name is missing or empty.
        '''
        if xml.tag in self.DescriptorTags:
            if 'name' in xml.attrib and not xml.get('display_name', ''):
                xml.set('display_name', xml.get('name'))
            if 'name' in xml.attrib:
                xml.attrib.pop('name')
        for child in xml:
            self.fix_old_descriptor_name(child)

    def fix_old_course_section(self):
        '''
        Remove <section>

        Every <section> becomes a <sequential>; any <sequential> found below
        it is dissolved by hoisting its children into its place.
        '''
        for sect in self.course.findall('.//section'):
            for seq in sect.findall('.//sequential'):
                for k in list(seq):     # snapshot: children are moved while we loop
                    seq.addprevious(k)
                # './/' may match a nested sequential, so remove it from its
                # actual parent rather than assuming it is a child of sect
                seq.getparent().remove(seq)
            sect.tag = 'sequential'

    def is_not_random_urlname(self, un):
        '''
        Return True unless un looks like a random (Studio-style) url_name:
        a string as long as 55bc076ad06e4ede9d0561948c03be2f with 6+ digits.
        '''
        nrand = len('55bc076ad06e4ede9d0561948c03be2f')
        if len(un) != nrand:
            return True
        ndigits = sum(1 for z in un if z in string.digits)
        if ndigits < 6:
            return True
        return False  # ie seems to be random

    def import_xml_removing_descriptor(self, dir, xml):
        '''
        load XML file, recursively following and removing intermediate
        descriptors with url_name.
        if element is a DescriptorTag element, and display_name is missing, then
        use its url_name, if that is available.
        dir should be a path.

        Also inlines <html filename=...> and <problem filename=...> content
        from the html/ and problems/ subdirectories; if such a file cannot be
        parsed the error is logged and the referencing element is kept as is.
        Returns the element that should stand in for xml.
        '''
        un = xml.get('url_name', '')
        if xml.tag in self.DescriptorTags and 'url_name' in xml.attrib and un:
            dxml = etree.parse(dir / xml.tag / (un + '.xml')).getroot()
            dxml.attrib.update(xml.attrib)
            dxml.attrib.pop('url_name')
            if self.keep_urls and self.is_not_random_urlname(un):
                dxml.set('url_name_orig', un)  # keep url_name as url_name_orig
            if dxml.tag in self.DescriptorTags and dxml.get('display_name') is None:
                if dxml.tag != 'course':  # special case: don't add display_name to course
                    dxml.set('display_name', un)
            xml = dxml

        fn = xml.get('filename', '')
        if xml.tag in ['html', 'problem'] and fn:  # special for <html filename="..." display_name="..."/>
                                                    # and <problem filename="...">
            if xml.tag == 'html':
                if not fn.endswith('.html'):
                    fn += '.html'
                if not fn.startswith('html/'):
                    fn = 'html/' + fn
                options = dict(parser=self.html_parser)
            else:
                if not fn.endswith('.xml'):
                    fn += '.xml'
                if not fn.startswith('problems/'):
                    fn = 'problems/' + fn
                options = {}
            # start from None so a failed parse can never reuse an earlier element
            fxml = None
            try:
                fxml = etree.parse(dir / fn, **options).getroot()
            except Exception as err:
                self.errlog("Error! Can't load and parse file %s, error: %s" % (dir / fn, err))
            if fxml is not None:
                if 'xmlns' in fxml.attrib:
                    fxml.attrib.pop('xmlns')
                fxml.attrib.update(xml.attrib)
                fxml.attrib.pop('filename')
                if fxml.tag in self.DescriptorTags and fxml.get('display_name') is None:
                    fxml.set('display_name', un)
                xml = fxml

        for child in list(xml):     # snapshot: children are replaced while we loop
            dchild = self.import_xml_removing_descriptor(dir, child)  # recurse
            if dchild is not child:
                child.addprevious(dchild)  # replace descriptor with contents
                xml.remove(child)
        return xml

    def export_to_directory(self, exdir='./'):
        '''
        Export xbundle to edX xml directory
        First insert all the intermediate descriptors needed.
        Do about and XML separately.

        Output goes to exdir/<course id>.  A missing or empty semester / org
        attribute on the course falls back to DefaultSemester / DefaultOrg.
        '''
        semester = self.course.get('semester') or self.DefaultSemester
        org = self.course.get('org') or self.DefaultOrg
        coursex = etree.Element('course')
        coursex.set('url_name', semester)
        coursex.set('org', org)
        coursex.set('course', self.course_id())

        self.export = self.make_descriptor(self.course, semester)
        self.export.append(self.course)
        self.add_descriptors(self.course)

        self.dir = self.mkdir(path(exdir) / self.course_id())
        self.export_meta_to_directory()
        self.export_xml_to_directory(self.export[0])

        # write out top-level course.xml
        with open(self.dir / 'course.xml', 'w') as fp:
            fp.write(self.pp_xml(coursex))

    def export_meta_to_directory(self):
        '''
        Write out metadata (about and policy) to directory.

        Policies go to policies/<semester>/<name>.json, about files to
        about/<filename>.  Comment nodes inside <policies> are skipped and an
        empty policy element produces an empty file.  A failure on one about
        file is logged and the remaining files are still written.
        '''
        pdir = self.mkdir(self.dir / 'policies')
        for pxml in self.metadata.findall('policies'):
            semester = pxml.get('semester')
            dir = self.mkdir(pdir / semester)
            for k in pxml:
                if callable(k.tag):     # comment / processing instruction: no file to write
                    continue
                fn = self.PolicyTagMap.get(k.tag, k.tag) + '.json'
                with open(dir / fn, 'w') as fp:
                    fp.write(k.text or '')  # write out content to policy directory file

        adir = self.mkdir(self.dir / 'about')
        for fxml in self.metadata.findall('about/file'):
            fn = fxml.get('filename')
            try:
                with open(adir / fn, 'w') as fp:
                    if fxml.text:
                        fp.write(fxml.text)
            except Exception as err:
                self.errlog('failed to write about file %s, error %s' % (adir / fn, err))

    def export_xml_to_directory(self, elem):
        '''
        Do this recursively.  If an element is a descriptor, then put that in its own
        subdirectory.

        A <descriptor> first exports its single child and then takes over that
        child's tag, leaving a url_name reference behind.  Any other element
        carrying a url_name is written to <tag>/<url_name>.xml and removed
        from its parent.  Comments and elements without url_name stay inline.
        '''
        def write_xml(x):
            '''write x to its own file and return the url_name used for the file'''
            un = x.attrib.pop('url_name')
            if self.keep_urls and 'url_name_orig' in x.attrib:
                x.attrib.pop('url_name_orig')
            un = un[:182]   # cap the length of the file name
            edir = self.mkdir(self.dir / x.tag)
            with open(edir / (un + '.xml'), 'w') as fp:
                fp.write(self.pp_xml(x))
            return un

        if elem.tag == 'descriptor':
            self.export_xml_to_directory(elem[0])  # recurse on children, depth first
            elem.tag = elem.attrib.pop('tag')      # descriptor turns into the pointer tag
            return
        if elem.tag == etree.Comment:              # comment <!-- foo -->
            return
        if elem.get('url_name') is None:
            return

        if elem.findall('.//descriptor'):
            for k in list(elem):    # snapshot: children may be removed while we loop
                self.export_xml_to_directory(k)  # recurse on children
        write_xml(elem)  # write to file and remove from parent
        elem.getparent().remove(elem)

    def course_id(self):
        '''return the course attribute of the course element ('' if unset)'''
        return self.course.get('course', '')

    def errlog(self, msg):
        '''report msg (printed to stdout)'''
        print(msg)

    def mkdir(self, p):
        '''p is a path; create it if it does not exist, and return it'''
        if not p.exists():
            p.mkdir()
        return p

    def pp_xml(self, xml):
        '''
        Return xml serialized as pretty-printed text.

        The serialized tree is piped through "xmllint --format -".  If xmllint
        cannot be started or exits with an error, lxml's own pretty printer is
        used instead (its output carries no XML declaration).  No temporary
        file is involved.
        '''
        import subprocess   # only needed here

        raw = etree.tostring(xml)
        out = None
        try:
            proc = subprocess.Popen(['xmllint', '--format', '-'],
                                    stdin=subprocess.PIPE, stdout=subprocess.PIPE)
            out, _ = proc.communicate(raw)
            if proc.returncode != 0:
                out = None
        except OSError:
            out = None      # xmllint is not installed / not executable
        if out is None:
            out = etree.tostring(xml, pretty_print=True)
        if not isinstance(out, str):
            out = out.decode('utf-8')
        return out

    def make_urlname(self, xml, parent=''):
        '''
        Build a unique url_name for xml and record it in self.urlnames.

        The name is derived from display_name, or else from the parent's
        display_name (or tag) plus the element's own tag.  Special characters
        are stripped or replaced; a clash is resolved by appending the parent
        url_name (when a display_name was present) and then by bumping a
        numeric suffix until the name is unused.
        '''
        dn = xml.get('display_name', '')
        s = dn
        if not s:
            xmlp = xml.getparent()
            if xmlp is None:
                s = xml.tag     # root element: there is no parent to borrow a name from
            else:
                s = xmlp.get('display_name', '')  # if no display_name, try to use parent's
                if not s:
                    s = xmlp.tag
                s += " " + xml.tag
        s = s.encode('ascii', 'xmlcharrefreplace')
        # Ordered on purpose: '/' also appears in the '_' group, so its
        # dedicated '__' rule has to run first to give a stable result.
        # NOTE(review): the original dict left this order up to the interpreter.
        replacements = (
            ('/', '__'),
            ('"\':<>?|![]', ''),
            (',/().;=+ ', '_'),
            ('&', 'and'),
        )
        for chars, repl in replacements:
            for ch in chars:
                s = s.replace(ch, repl)
        if not s:
            s = xml.tag     # everything was stripped: fall back to the tag
        if dn and s in self.urlnames and parent:
            s += '_' + parent
        while s in self.urlnames:
            m = re.match(r'(.+?)([0-9]*)$', s)
            (s, idx) = m.groups()
            idx = int(idx or 0)
            s += str(idx + 1)
        self.urlnames.append(s)
        return s

    def make_descriptor(self, xml, url_name='', parent=''):
        """
        Construct and return a descriptor element for the given element
        at the head of xml.
        Use url_name for the descriptor, if given.

        Otherwise, with keep_urls, a non-random url_name_orig is reused; failing
        that a name is generated with make_urlname.  The chosen url_name is
        also set on xml itself.
        """
        descriptor = etree.Element('descriptor')
        descriptor.set('tag', xml.tag)
        uno = xml.get('url_name_orig', '')
        if self.keep_urls and not url_name and uno and self.is_not_random_urlname(uno):
            url_name = uno
        if not url_name:
            url_name = self.make_urlname(xml, parent=parent)
        descriptor.set('url_name', url_name)
        xml.set('url_name', url_name)
        return descriptor

    def add_descriptors(self, xml, parent=''):
        '''
        Recursively walk through self.course and add descriptors
        A descriptor is an intermediate tag, which points to content
        via a url_name.  These are used by edX to simplify loading
        of course content.

        parent is the url_name of the closest enclosing descriptor; it is
        passed to make_urlname to help disambiguate generated names.
        '''
        for elem in list(xml):      # snapshot: children are re-parented while we loop
            if (self.force_studio_format and xml.tag == 'sequential'
                    and not callable(elem.tag)      # leave comments / PIs alone
                    and elem.tag != 'vertical'):    # studio needs seq -> vert -> other
                # move child into vertical
                vert = etree.Element('vertical')
                elem.addprevious(vert)
                vert.append(elem)
                elem = vert  # continue processing on the vertical

            # url_name handed to the children: the element's own if it has one,
            # otherwise the one inherited from above
            child_parent = elem.get('url_name') or parent
            if elem.tag in self.DescriptorTags and not elem.get('url_name', ''):
                desc = self.make_descriptor(elem, parent=parent)
                elem.addprevious(desc)
                desc.append(elem)  # move descriptor to become new parent of elem
                child_parent = desc.get('url_name')
            self.add_descriptors(elem, child_parent)  # recurse
#-----------------------------------------------------------------------------
# tests
def RunTests():
    """Run the built-in unit tests through a text test runner.

    The single test builds a small bundle in memory, exports it below
    ./testdata, imports it again and checks that both serializations agree.
    """
    import unittest

    class TestXBundle(unittest.TestCase):

        def testRoundTrip(self):
            print("Testing XBundle round trip import -> export")

            cxmls = '''
<course semester="2013_Spring" course="mitx.01">
<chapter display_name="Intro">
<sequential display_name="Overview">
<html display_name="Overview text">
hello world
</html>
</sequential>
<!-- a comment -->
</chapter>
</course>
'''
            pxmls = """
<policies semester='2013_Spring'>
<gradingpolicy>y:2</gradingpolicy>
<policy>x:1</policy>
</policies>
"""
            # assemble the bundle from its parts and remember how it serializes
            bundle = XBundle()
            bundle.set_course(etree.XML(cxmls))
            bundle.add_policies(etree.XML(pxmls))
            bundle.add_about_file("overview.html", "hello overview")
            xbin = str(bundle)

            # export into the scratch directory
            tdir = 'testdata'
            if not os.path.exists(tdir):
                os.mkdir(tdir)
            bundle.export_to_directory(tdir)

            # test round trip
            reimported = XBundle()
            reimported.import_from_directory(tdir + '/mitx.01')
            xbreloaded = str(reimported)

            if xbin != xbreloaded:
                # show both versions to make the mismatch easy to inspect
                for chunk in ("xbin", xbin, "xbreloaded", xbreloaded):
                    print(chunk)
            self.assertEqual(xbin, xbreloaded)

    suite = unittest.TestLoader().loadTestsFromTestCase(TestXBundle)
    unittest.TextTestRunner().run(suite)
#-----------------------------------------------------------------------------
# main
if __name__ == '__main__':

    def usage():
        """Print the command-line help text."""
        print("Usage: python xbundle.py [--force-studio] [cmd] [infn] [outfn]")
        print("where:")
        print(" cmd = test: run unit tests")
        print(" cmd = convert: convert between xbundle and edX directory format")
        print(" the xbundle filename must end with .xml")
        print(" --force-studio forces <sequential> to always be followed by <vertical> in export")
        print(" this makes it compatible with Studio import")
        print("")
        print("examples:")
        print(" python xbundle.py convert ../data/edx4edx edx4edx_xbundle.xml")
        print(" python xbundle.py convert edx4edx_xbundle.xml ./")

    if len(sys.argv) < 2:
        usage()
        sys.exit(0)

    argc = 1
    options = dict(keep_urls=True)
    if sys.argv[argc] == '--force-studio':
        argc += 1
        options['force_studio_format'] = True

    # the option may have been the only argument: a command is still required
    if len(sys.argv) <= argc:
        usage()
        sys.exit(1)

    cmd = sys.argv[argc]
    if cmd == 'test':
        RunTests()
    elif cmd == 'convert':
        # convert needs both an input and an output path after the command
        if len(sys.argv) < argc + 3:
            usage()
            sys.exit(1)
        infn = sys.argv[argc + 1]
        outfn = sys.argv[argc + 2]
        xb = XBundle(**options)
        # direction is decided by which of the two names ends with .xml
        if infn.endswith('.xml'):
            print("Converting xbundle file '%s' to edX xml directory '%s'" % (infn, outfn))
            xb.load(infn)
            xb.export_to_directory(outfn)
            print("done")
        elif outfn.endswith('.xml'):
            print("Converting edX xml directory '%s' to xbundle file '%s'" % (infn, outfn))
            xb.import_from_directory(infn)
            xb.save(outfn)
            print("done")
        else:
            usage()
    else:
        usage()