Added unicode pre- and post-processing

fyngyrz · Jan 7, 2019 · 9645fb5 · 9645fb5
1 parent 4ed3f63
commit 9645fb5
Show file tree

Hide file tree

Showing 4 changed files with 129 additions and 8 deletions.
diff --git a/aa_macro.py b/aa_macro.py
@@ -30,12 +30,12 @@ class macro(object):
                  you written your congresscritter about patent and
                  copyright reform yet?
   Incep Date: June 17th, 2015       (for Project)
-     LastRev: January 6th, 2019     (for Class)
-  LastDocRev: January 6th, 2019     (for Class)
+     LastRev: January 7th, 2019     (for Class)
+  LastDocRev: January 7th, 2019     (for Class)
      Version: 
 	"""
 	def version_set(self):
-		return('1.0.133 Beta')
+		return('1.0.134 Beta')
 	"""
  Tab spacing: 4 (set your editor to this for sane formatting while reading)
      Dev Env: OS X 10.6.8, Python 2.6.1 from inception
@@ -467,13 +467,34 @@ class macro() is for authoring by you, the person who ISN'T trying to
 	noembrace -- False (default) or True disables [embrace] built-in
 	noinclude -- False (default) or True disables [include] built-in
 	back ------- ffffff (default) HEX3 or HEX color for background color in HTML 4.01s mode
+	ucin ------- False (default) presumes input is 0-127 ASCII; True routes dothis through unido() as Unicode
+	ucout ------ False (default) output is ASCII; True makes uniget() convert the result to Unicode
 	dothis ----- None   (default) you can pass in initial text to be processed here if you like
 	             the object returns the result in its string method:
 					mod = macro(dothis='[style x foo [b]]'{x bar})
-					print mod # prints 'foo bar'
+					print mod # prints 'foo bar' if the data is ASCII
+					# if the data is Unicode, use unido() / uniget() as described under "Unicode"
+
+	Unicode
+	=======
+	Unicode presents encoding issues for Python 2.7.
+	The following examples show how to deal with
+	Unicode in the context of aa_macro:
+
+	Processing unicode input to ASCII / HTML output:
+	------------------------------------------------
+	mod = macro(ucin=True)
+	s = mod.unido(testBlock) # s will be ASCII; non-ASCII characters become &#NNN; numeric HTML entities
+
+	Processing Unicode input to Unicode output:
+	-------------------------------------------
+	mod = macro(ucin=True,ucout=True)
+	mod.unido(testBlock)
+	s = mod.uniget() # s will be a unicode string (numeric entities decoded back to characters)
 
 	The Rules:
 	----------
+	o Unicode requires specific processing as shown above.
 	o Do not attempt to define one style inside another.
 	o Style names may contain anything but a space or a newline
 	o repeat gets a number or a variable parameter. Nothing else. No nesting in the parameter!
@@ -490,12 +511,14 @@ class macro() is for authoring by you, the person who ISN'T trying to
 	  content can have commas, but the macro system won't see them. Of course, you can't
 	  use that on anything that *needs* commas for parameters. Life is so complicated. :)
 	"""
-	def __init__(self,dothis=None,mode='3.2',back="ffffff",nodinner=False,noshell=False,noinclude=False,noembrace=False,debug=False,locklipath='',lockwepath='',xlimit=0,dlimit=0):
+	def __init__(self,dothis=None,mode='3.2',back="ffffff",nodinner=False,noshell=False,noinclude=False,noembrace=False,debug=False,locklipath='',lockwepath='',xlimit=0,dlimit=0,ucin=False,ucout=False):
 		self.locklipath = locklipath
 		self.lockwepath = lockwepath
 		self.xlimit = xlimit
 		self.dlimit = dlimit
 		self.xdcount = 0
+		self.ucin = ucin
+		self.ucout = ucout
 		self.lipath = locklipath
 		self.wepath = lockwepath
 		self.setMode(mode)
@@ -595,11 +618,68 @@ def __init__(self,dothis=None,mode='3.2',back="ffffff",nodinner=False,noshell=Fa
 		self.months = ['January','February','March','April','may','June','July','August','September','October','November','December']
 		self.setup_urle()
 		if dothis != None:
-			self.do(dothis)
+			if self.ucin == True:
+				self.unido(dothis)
+			else:
+				self.do(dothis)
 
 	def __str__(self):
 		return self.result
 
+	def uniget(self):
+		if self.ucout == True:
+			self.result = self.asciitounicode(self.result)
+		return self.result
+
+	def unido(self,text):
+		tmp = unicode(text)
+		text = self.unicodetoascii(tmp) # convert input unicode to ASCII
+		self.do(text)
+		return self.result
+
+	def asciitounicode(self,text):
+		state = 0 # nothing detected
+		accum = u''
+		o = u''
+		for c in text:
+			if state == 0:		# nothing as yet?
+				if c == u'&':	# ampersand?
+					state = 1	# ampersand!
+				else:
+					o += c
+			elif state == 1:	# ampersand found?
+				if c == u'#':	# hash?
+					state = 2	# hash!
+					accum = u''	# clear accumulator
+				else:			# not a hash, so not an entity encoding
+					state = 0	# abort
+					o += u'&'+c	# flush char, done
+			elif state == 2:	# expecting digits or terminating semicolon
+				if c.isdigit():	# digit?
+					accum += c	# add it to accumulator if so
+				elif c == u';':	# terminating
+					s = u'\\U%08x' % (int(accum))
+					ss= s.decode('unicode-escape')
+					o += ss
+					state = 0
+				else: # bad encoding?
+					o += u'&#'
+					o += accum
+					state = 0
+		return o
+
+	def unicodetoascii(self,text):
+#		global ucrep
+		o = ''
+		n = len(text)
+		for i in range(0,n):
+			try:
+				c = text[i].encode("ascii")
+				o += c
+			except:
+				o += '&#{:d};'.format(ord(text[i]))
+		return o
+
 	def setDebug(self,db):
 		if db != False:
 			self.debug = True

diff --git a/changelog.md b/changelog.md
@@ -6,8 +6,16 @@ This log reflects changes to the aa_macro.py import library. Other changes
 such as to the associated utilities and sample files are not tracked here.
 
 ### Log
+1.0.134
+ * added unicode pre- and post-processing
+
+1.0.133
+ * added [gstyle] and [style] secondary help strings
+ * added [helpg2 stylename] and [helps2 stylename] to return secondary style help strings
+
 1.0.132
- * added [helpg] and [helps] to return style help strings
+ * added [gstyle] and [style] help strings
+ * added [helpg stylename] and [helps stylename] to return style help strings
 
 1.0.131
  * added (digits=decdigits,) option to [round value]

diff --git a/test_aa_macro.py b/test_aa_macro.py
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 
 import unittest
+import codecs
 from aa_macro import *
 import difflib
 import sys
@@ -58,11 +59,39 @@ def test_aa_macro(self):
 		expect = 'expected.html'
 		badout = 'badoutput.html'
 
+		# read unicode sample:
+		try:
+			fh = codecs.open('unisample.txt', encoding='utf-8')
+			testBlock = fh.read()
+			fh.close()
+			st = str(type(testBlock))
+			if st != "<type 'unicode'>": print 'failure to convert file to unicode'
+		except:
+			print 'failed to read unicode sample'
+
+		# process unicode to ASCII:
+		try:
+			umod = macro(ucin=True)
+			s = umod.unido(testBlock)
+			st = str(type(s))
+			if st != "<type 'str'>": print 'failure to convert unicode to ASCII'
+		except:
+			print 'failed to process unicode to ASCII'
+
+		# process unicode to unicode:
+		try:
+			umod = macro(ucin=True,ucout=True)
+			umod.unido(testBlock)
+			s = umod.uniget()
+			st = str(type(s))
+			if st != "<type 'unicode'>": print 'failure to process unicode to unicode'
+		except:
+			print 'failed to process unicode to unicode'
+
 		rebuild = 1
 		fh = open('mactest.txt')
 		testBlock = fh.read()
 		fh.close()
-#		mod = macro()
 		mod = macro(debug=True)
 		output = mod.do(testBlock)
 		dtrace = mod.getdebug()

diff --git a/unisample.txt b/unisample.txt
@@ -0,0 +1,4 @@
+[b bold]
+🌮💩🍕🍝🍷🐱⚓🔗✅✔️☑️💃⚠️😊
+🍆🥜
+[i italics]