From c44025430c0ab17299359db3ecb9917f5da74902 Mon Sep 17 00:00:00 2001 From: David Barbour Date: Mon, 14 Jul 2014 20:24:23 -0500 Subject: [PATCH] adjusting code for the new resource model --- AboutABC.md | 2 +- ao.cabal | 2 +- doc/Compilation.md | 13 +-- hsrc/ABC/Hash.hs | 39 ------- hsrc/ABC/Imperative/JIT.hs | 8 +- hsrc/ABC/Resource.hs | 203 +++++++++++++++++++++++++++++++++++++ hsrc/AO/Precompile.hs | 62 ++++++----- 7 files changed, 246 insertions(+), 83 deletions(-) delete mode 100644 hsrc/ABC/Hash.hs create mode 100644 hsrc/ABC/Resource.hs diff --git a/AboutABC.md b/AboutABC.md index c9b2647..2f76a92 100644 --- a/AboutABC.md +++ b/AboutABC.md @@ -71,7 +71,7 @@ Effectful tokens are typically specific to a virtual machine or runtime environm * `{&ann}` - annotation, identity behavior, for performance and debugging. * `{:seal}` and `{.seal}` - sealers and unsealers for rights amplification. -* `{#abcResourceName}` - link and load, separate compilation +* `{#abcResourceToken}` - link and load, separate compilation These are discussed with more detail in later sections. diff --git a/ao.cabal b/ao.cabal index 86fb6b7..3d0ff5a 100644 --- a/ao.cabal +++ b/ao.cabal @@ -99,7 +99,7 @@ Library ABC.Operators ABC.Quote ABC.Simplify - ABC.Hash + ABC.Resource ABC.Imperative.Value ABC.Imperative.Operations diff --git a/doc/Compilation.md b/doc/Compilation.md index 4240765..ea44bf6 100644 --- a/doc/Compilation.md +++ b/doc/Compilation.md @@ -21,19 +21,12 @@ At the moment, dynamic compilation is supported by use of an `{&compile}` annota I am also interested in supporting 'static' compilation, per word in the dictionary. Per-word compilation can potentially mitigate exponential expansion overheads associated with AO's 'inline everything' model, and jumpstart early use of external bytecode resources. -The question I have is how to best control per-word compilation. Some possibilities: +The question I have is how to best guide per-word compilation. -* Use a prefix, e.g. 
`#` so `#foo` is implicitly targeted for compilation -* Define a separate preCompiledWords file or definition -* Annotate the `foo` definition for compilation -* Define `compile.foo` for each `foo` that I wish to precompile -* Heuristically infer words for compilation +Use of a prefix (e.g. words starting with `#` are compiled) initially appealed to me, but with hindsight it's hard to understand why. A tight coupling between names and performance is very painful during development, when we're trying to tweak the performance. Also, it would not compose well if we tried to use the same convention to guide other aspects of implementation. -Use of a prefix initially appealed to me, but with hindsight it's hard to understand why. It seems a tight coupling between names and performance is very painful during development. Use of a separate file seems like it might be disadvantageous long-term, and is contrary to my dictionary-as-OS concept. Use of a single compilation word centralizes a lot of management, and thus requires careful administration. Annotations don't really have a well-defined scope as components, which can make them more difficult to process. +So, the guidance should be separate from the word itself. The most promising possibility is to simply define `compile!foo` for every word `foo` we wish to compile (ignoring the actual contents of `compile!foo`). -It seems to me the best option is to focus on per-word compilation, and perhaps the heuristic approach. - -An interesting possibility is to automatically combine these. I could try to use a word like `compile.foo` to *modify* the compilation heuristic in the particular case of `foo`. ## Multiple Compilers? 
diff --git a/hsrc/ABC/Hash.hs b/hsrc/ABC/Hash.hs deleted file mode 100644 index a86d508..0000000 --- a/hsrc/ABC/Hash.hs +++ /dev/null @@ -1,39 +0,0 @@ - --- | ABC has a non-conventional approach to separate compilation and --- linking: an external ABC resource is named by secure hash of its --- bytecode, i.e. `{#secureHashOfResource}`. The hash in question is --- the SHA3-384, encoded via base64url. --- --- The bytecode `{#secureHashOfResource}` will apply the identified --- ABC resource to the tacit value. Of course, this may require the --- runtime to download the resource, and serves as an opportunity to --- separately typecheck, compile, and cache the resource for reuse. --- Careful use of secure hash resources can mitigate the otherwise --- exponential size of ABC streams, reducing memory and cache burden --- and potentially saving bandwidth in a distributed stream. --- --- Anyhow, hashing ABC code comes up often enough, and in a standard --- manner to support linking. This module provides the function for it. -module ABC.Hash (abcHash) where - -import qualified Data.ByteString as B -import qualified Data.Byteable as B -import qualified Data.Text as T -import qualified Data.Text.Encoding as T -import qualified Crypto.Hash as CH -import qualified Data.ByteString.Base64.URL as B64 -import ABC.Operators - --- | abcHash takes an ABC sequence and returns a base64url string... --- which will have length 64 characters. -abcHash :: [Op] -> String -abcHash = toBase64 . B.toBytes . sha3_384 . T.encodeUtf8 . T.pack . show - --- the type declaration selects the hash function (yuck) -sha3_384 :: B.ByteString -> CH.Digest CH.SHA3_384 -sha3_384 = CH.hash - -toBase64 :: B.ByteString -> String -toBase64 = fmap toChar . B.unpack . B64.encode where - toChar = toEnum . 
fromIntegral - diff --git a/hsrc/ABC/Imperative/JIT.hs b/hsrc/ABC/Imperative/JIT.hs index c25ea1a..7176735 100644 --- a/hsrc/ABC/Imperative/JIT.hs +++ b/hsrc/ABC/Imperative/JIT.hs @@ -29,15 +29,15 @@ import qualified System.Environment as Env import qualified Control.Exception as Err import ABC.Operators -import ABC.Hash +import ABC.Resource -- | compute a cryptographically unique module name for ABC code --- (will always be equivalent to `hashToModuleName . abcHash`) +-- (equivalent to `hashToModuleName . tail . abcResourceToken`) abcToModuleName :: [Op] -> String -abcToModuleName = hashToModuleName . abcHash +abcToModuleName = hashToModuleName . L.tail . abcResourceToken -- | compute a module name from the hash of the ABC code --- (lossy, but retains at least 320 bits for uniqueness) +-- (lossy, but retains 320 bits from 384 for uniqueness) hashToModuleName :: String -> String hashToModuleName s = let (a,a') = L.splitAt 2 (fmap modChar s) in diff --git a/hsrc/ABC/Resource.hs b/hsrc/ABC/Resource.hs new file mode 100644 index 0000000..3e20737 --- /dev/null +++ b/hsrc/ABC/Resource.hs @@ -0,0 +1,203 @@ +{-# LANGUAGE ViewPatterns #-} + +-- | ABC has a non-conventional approach to separate compilation and +-- linking: ABC resources are given deterministic, cryptographically +-- unique names using secure hash of the bytecode, and ABC can invoke +-- these resources by name to logically inline the bytecode. +-- +-- To support integration with content distribution networks, ABC +-- resources are encrypted. The decryption key is the secure hash of +-- the bytecode. The full name includes both the decryption key and +-- the secure hash of the cipher text to quickly look up the resource. 
+-- +-- The full ABC invocation looks like: +-- +-- {#secureHashOfCiphertext:secureHashOfBytecode} +-- +-- Pseudocode for resource construction: +-- +-- given bytecode +-- encryptionKey = secureHashBC(bytecode) +-- cipherText = encrypt(compress(bytecode),encryptionKey) +-- lookupKey = secureHashCT(cipherText) +-- store(lookupKey,cipherText) +-- resourceId = lookupKey:encryptionKey +-- return resourceId +-- +-- Pseudocode for resource acquisition: +-- +-- given resourceId +-- extract lookupKey, encryptionKey from resourceId +-- cipherText = fetch(lookupKey) +-- validate(lookupKey == secureHashCT(cipherText)) +-- bytecode = decompress(decrypt(cipherText,encryptionKey)) +-- validate(encryptionKey == secureHashBC(bytecode)) +-- validateABC(bytecode) +-- return bytecode +-- +-- Separate compilation is opportunistic. We expect reuse of names, +-- so we can cache and compile the bytecode associated with a given +-- resourceId. +-- +-- Not all details are settled. Some relatively stable decisions: +-- +-- secureHash CT,BC: are independent halves of SHA3-384 +-- base64url encoding of secure hashes in resource ID +-- AES encryption, CTR mode, nonce simple function of key +-- simple, deterministic, unambiguous compression algorithm +-- +-- This module implements the standard resource model as far as it +-- has been implemented, stubbing the elements that haven't been +-- decided yet. 
+-- +module ABC.Resource + ( HashBC, HashCT + , CipherText, ResourceToken + , secureHashBC, secureHashCT + , makeResource, loadResource + , abcResourceToken + ) where + +import Data.Functor.Identity +import qualified Data.List as L +import Data.ByteString (ByteString) +import Data.Maybe (mapMaybe) +import qualified Data.ByteString as B +import qualified Data.Byteable as B +import qualified Data.Text as T +import qualified Data.Text.Encoding as T +import qualified Text.Read as R +import qualified Crypto.Hash as CH +import qualified Data.ByteString.Base64.URL as B64 +import ABC.Operators + +-- | HashCT and HashBC are 24 octet bytestrings +type HashBC = ByteString +type HashCT = ByteString + +-- | encrypted, compressed bytecode +type CipherText = ByteString + +-- | A resource token is the text that goes between curly braces +-- for invocation in ABC. I.e. in this case, it has the form: +-- #secureHashOfCiphertext:secureHashOfBytecode +-- with the secure hashes encoded in base64url. +type ResourceToken = String + + +-- | generate secure hash for a series of operations. +-- (second half of SHA3-384 on UTF-8 encoding of ABC) +secureHashBC :: [Op] -> HashBC +secureHashBC = secureHashBC' . encodeABC + +-- | encode ABC as a bytestring +encodeABC :: [Op] -> ByteString +encodeABC = T.encodeUtf8 . T.pack . show + +-- | decode bytestring as ABC program +decodeABC :: ByteString -> Maybe [Op] +decodeABC bcBytes = + case T.decodeUtf8' bcBytes of + Left _ -> Nothing + Right txt -> + let s = T.unpack txt in + case R.readList s of + [(ops,"")] -> Just ops + _ -> Nothing + +-- generate secure hash for the bytecode from bytestring +secureHashBC' :: ByteString -> HashBC +secureHashBC' = B.drop 24 . sha3_384 + +-- | secure hash for the ciphertext (used as lookup key) +-- (first half of SHA3-384) +secureHashCT :: CipherText -> HashCT +secureHashCT = B.take 24 . 
sha3_384 + +-- | given a storage function and resource, create the ABC resource +-- and return deterministic, unique resource token. This token can +-- later be used with loadResource to obtain the bytecode. +makeResource :: (Monad m) => (HashCT -> CipherText -> m ()) -> [Op] -> m ResourceToken +makeResource fnStore bytecode = + let bcBytes = encodeABC bytecode in + let hashBC = secureHashBC' bcBytes in + let cipherText = encrypt hashBC (compress bcBytes) in + let hashCT = secureHashCT cipherText in + fnStore hashCT cipherText >> + let resourceId = "#" ++ toBase64 hashCT ++ ":" ++ toBase64 hashBC in + return resourceId + +-- | purely compute the resource token without storing the resource +abcResourceToken :: [Op] -> ResourceToken +abcResourceToken = runIdentity . makeResource nullStore where + nullStore _hashCT _cipherText = return () + +-- | given a resource loading function, access an ABC resource via +-- token. In unlikely case of HashCT collisions, multiple candidate +-- cipher texts can be returned. The bytecode hash will provide an +-- additional uniqueness filter. +-- +-- loadResource will validate that all hashes are matched and that +-- the bytecode can parse as ABC. It will fail if no valid resources +-- are loaded, or (very improbably!) in case of full 384-bit secure +-- hash collision. +-- +loadResource :: (Monad m) => (HashCT -> m [CipherText]) -> ResourceToken -> m [Op] +loadResource fnLoad tok@(splitToken -> Just (hashCT,hashBC)) = + fnLoad hashCT >>= \ lLoadedTexts -> + let lMatchHashCT = L.filter ((== hashCT) . secureHashCT) lLoadedTexts in + let lbcBytes = L.nub $ fmap (decompress . decrypt hashBC) lMatchHashCT in + let lMatchHashBC = L.filter ((== hashBC) . secureHashBC') lbcBytes in + let lOps = mapMaybe decodeABC lMatchHashBC in + case lOps of + [] -> fail ("ABC resource " ++ tok ++ " not found") -- e.g. network failure + [ops] -> return ops + _ -> fail ("ABC resource " ++ tok ++ " is ambiguous") -- secure hash collision! 
+loadResource _ tok = fail $ "invalid resource token: " ++ tok + +-- extract information from a resource token +splitToken :: ResourceToken -> Maybe (HashCT, HashBC) +splitToken ('#':rscid) = + let (rct,crbc) = L.splitAt 32 rscid in + case crbc of + (':':rbc) -> + case (fromBase64 rct, fromBase64 rbc) of + (Just hct, Just hbc) -> + -- ensure 192 bits for each hash + let okSize = (24 == B.length hct) && (24 == B.length hbc) in + if okSize then Just (hct,hbc) else Nothing + _ -> Nothing + _ -> Nothing -- not a valid token +splitToken _ = Nothing + +-- the type declaration selects the hash function (yuck) +sha3_384 :: ByteString -> ByteString +sha3_384 = B.toBytes . sha3_384' + +sha3_384' :: ByteString -> CH.Digest CH.SHA3_384 +sha3_384' = CH.hash + +toBase64 :: ByteString -> String +toBase64 = fmap toChar . B.unpack . B64.encode where + toChar = toEnum . fromIntegral + +fromBase64 :: String -> Maybe ByteString +fromBase64 = e2mb . B64.decode . T.encodeUtf8 . T.pack where + e2mb = either (const Nothing) Just + + +-- todo: implement encryption +encrypt :: HashBC -> ByteString -> CipherText +encrypt _key = id + +-- todo: implement decryption +decrypt :: HashBC -> CipherText -> ByteString +decrypt _key = id + +-- todo: implement compression +compress :: ByteString -> ByteString +compress = id + +-- todo: implement decompression +decompress :: ByteString -> ByteString +decompress = id diff --git a/hsrc/AO/Precompile.hs b/hsrc/AO/Precompile.hs index e220ad4..4f3f410 100644 --- a/hsrc/AO/Precompile.hs +++ b/hsrc/AO/Precompile.hs @@ -1,25 +1,29 @@ --- | This is an idea for partially pre-compiling the AO dictionary. +-- | AO's 'inline everything' semantics are simple for reasoning, but +-- not efficient. It makes poor reuse of memory, CPU cache, separate +-- compilation, and bandwidth. To mitigate this, AO systems leverage +-- ABC's separate compilation and linking model. 
Any sequence of ABC +-- can be given a cryptographically unique resource token, which may +-- then be invoked to logically inline the associated ABC. -- --- The convention for deciding which words to precompile has not yet --- settled. The encoding below compiles words starting with `#`, but --- it is already obvious that this was a bad idea. We might instead --- compile words for which `compile.foo` is defined, or a variation --- on that. +-- See module ABC.Resource for more about the resource model. -- --- Precompiled words will show up in the resulting ABC using the --- full provider-independent capability: +-- Long term, the idea is that we should use machine learning to chop +-- large ABC programs into highly reusable, near-optimal components. -- --- {#secureHashOfCiphertext:secureHashOfBytecode} --- --- Which is to say, precompiled words is aimed to jumpstart Awelon's --- distribution and separate compilation features. A runtime that --- supports precompiled words can easily be extended to download the --- resources from a remote server. +-- But this module is concerned with a short term solution. A subset +-- of words in the AO dictionary will be treated as ABC resources by +-- replacing their definitions with the appropriate invocation. The +-- ABC resources may truly be compiled separately for performance. +-- +-- The selection of words for compilation is based on convention. At +-- this time, we simply compile every word `foo` for which there is +-- a word `compile!foo` defined in the dictionary. This is rather +-- ad-hoc, but it will do the job well enough for now. -- --- This module will output the code segments that should be further --- compiled, along with their identifiers. Further compilation is --- left to the runtime. +-- TODO: eventually, I need to support sensitivity concerns, i.e. to +-- compile sensitive ABC modules using an unguessable secret to guard +-- against confirmation attacks. 
-- module AO.Precompile ( preCompileDict @@ -38,7 +42,7 @@ import AO.Compile import AO.InnerDict import ABC.Operators -import ABC.Hash +import ABC.Resource import ABC.Quote import ABC.Simplify @@ -47,20 +51,20 @@ type PreCompD = M.Map HashString [Op] -- hash string to operators type InnerD md = M.Map Word (AO_Code, md) -- original or final code type PCX = (M.Map Word HashString, M.Map HashString [Op]) --- | precompile all words whose names start with `#`. +-- | precompile all words for which `compile!word` is defined -- --- Precompiled code is emitted as a map of abcHash values to --- Awelon bytecode, pre-simplified but otherwise unmodified. +-- Precompiled code is emitted as a map of abcResourceToken values +-- to Awelon bytecode, pre-simplified but otherwise unmodified. preCompileDict :: AODict md -> (AODict md, PreCompD) preCompileDict = flip evalState (M.empty,M.empty) . runPreCompile --- find words starting with '#' -isPCW :: Word -> Bool -isPCW = maybe False ((== '#') . fst) . T.uncons +isPCW :: M.Map Word a -> Word -> Bool +isPCW d w = M.member cw d where + cw = T.pack "compile!" `T.append` w runPreCompile :: AODict md -> State PCX (AODict md, PreCompD) runPreCompile (AODict d0) = - let lTargets = L.filter isPCW $ M.keys d0 in + let lTargets = L.filter (isPCW d0) $ M.keys d0 in mapM_ (preComp d0) lTargets >> -- accumulates in state get >>= \ (hsWords,preCompD) -> let df = foldr updateWord d0 (M.toList hsWords) in @@ -69,7 +73,7 @@ runPreCompile (AODict d0) = updateWord :: (Word, HashString) -> InnerD md -> InnerD md updateWord (w,hs) = M.update fn w where fn (_,meta) = pure (code',meta) - code' = [AO_Tok ('#':hs)] -- single token + code' = [AO_Tok hs] -- single token -- compile a word to a hash string preComp :: InnerD md -> Word -> State PCX HashString @@ -80,7 +84,9 @@ preComp d w = Nothing -> let code = fst (d M.! 
w) in simplify <$> (aoCodeToABC d code) >>= \ ops -> - let hs = abcHash ops in + -- todo: modify this to more directly construct + -- both encrypted and decrypted storage for tokens. + let hs = abcResourceToken ops in get >>= \ (mW,mHS) -> put (M.insert w hs mW, M.insert hs ops mHS) >> return hs @@ -92,7 +98,7 @@ aoCodeToABC _ [] = return [] -- obtain bytecode, translating precompiled words to tokens aoActionToABC :: InnerD md -> AO_Action -> State PCX [Op] -aoActionToABC d (AO_Word w) | isPCW w = preComp d w >>= \ hs -> return [Tok ('#':hs)] +aoActionToABC d (AO_Word w) | isPCW d w = preComp d w >>= \ hs -> return [Tok hs] | otherwise = aoCodeToABC d (fst (d M.! w)) aoActionToABC d (AO_Block aoOps) = aoCodeToABC d aoOps >>= \ ops -> return [BL ops] aoActionToABC _ (AO_Num r) = return $ quotes r [Op_l]