Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements #1

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/vendor/
/composer.lock
9 changes: 8 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@
"email": "[email protected]"
}
],
"autoload": {
"psr-4": {
"CodeRevolutionPlugins\\GPT3Encoder\\": "src/"
}
},
"minimum-stability": "stable",
"require": {}
"require": {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should ext-mbstring also be added?

"php": ">7.4"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any particular reason to go with such an old PHP version? 7.4 is past its EOL. I would suggest just going with 8.0 here (security patches stopping in November of this year), if not 8.1 or 8.2

}
}
File renamed without changes.
File renamed without changes.
File renamed without changes.
314 changes: 3 additions & 311 deletions gpt3-encoder.php
Original file line number Diff line number Diff line change
@@ -1,319 +1,11 @@
<?php

// Byte-pair-encode $text into an array of GPT-2/GPT-3 token ids.
// Loads its lookup tables (characters.json, encoder.json, vocab.bpe) from this
// file's directory on every call; on any load/match failure it logs via
// error_log() and returns the tokens collected so far (possibly empty).
function gpt_encode($text)
{
$bpe_tokens = array();
if(empty($text))
{
return $bpe_tokens;
}
// Byte-value -> printable-symbol map (the GPT-2 "byte encoder"),
// keyed by integer code point.
$raw_chars = file_get_contents(dirname(__FILE__) . "/characters.json");
$byte_encoder = json_decode($raw_chars, true);
if(empty($byte_encoder))
{
error_log('Failed to load characters.json: ' . $raw_chars);
return $bpe_tokens;
}
// BPE-piece -> token-id dictionary.
$rencoder = file_get_contents(dirname(__FILE__) . "/encoder.json");
$encoder = json_decode($rencoder, true);
if(empty($encoder))
{
error_log('Failed to load encoder.json: ' . $rencoder);
return $bpe_tokens;
}
// NOTE(review): this line comes from the added side of the diff this file was
// captured from and sits mid-function here — verify its intended placement.
require_once __DIR__.'/vendor/autoload.php';

// Ordered list of BPE merges, one "left right" pair per line.
$bpe_file = file_get_contents(dirname(__FILE__) . "/vocab.bpe");
if(empty($bpe_file))
{
error_log('Failed to load vocab.bpe');
return $bpe_tokens;
}

// GPT-2 pre-tokenizer: English contractions, letter runs, digit runs and
// other-symbol runs (each optionally space-prefixed), then whitespace.
preg_match_all("#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u", $text, $matches);
if(!isset($matches[0]) || count($matches[0]) == 0)
{
error_log('Failed to match string: ' . $text);
return $bpe_tokens;
}
// Skip vocab.bpe's first (header) line, then split each merge line on
// whitespace, dropping empty fragments via gpt_my_filter().
$lines = preg_split('/\r\n|\r|\n/', $bpe_file);
$bpe_merges = array();
$bpe_merges_temp = array_slice($lines, 1, count($lines), true);
foreach($bpe_merges_temp as $bmt)
{
$split_bmt = preg_split('#(\s+)#', $bmt);
$split_bmt = array_filter($split_bmt, 'gpt_my_filter');
if(count($split_bmt) > 0)
{
$bpe_merges[] = $split_bmt;
}
}
// Rank table: "left,right" => merge priority (lower merges first).
$bpe_ranks = gpt_dictZip($bpe_merges, range(0, count($bpe_merges) - 1));

// Memoisation cache shared across gpt_bpe() calls for repeated tokens.
$cache = array();
foreach($matches[0] as $token)
{
$new_tokens = array();
$chars = array();
// utf8_encode() reads the UTF-8 token as Latin-1, so each original BYTE
// becomes one character whose code point equals that byte's value; the
// per-character loop below then recovers raw byte values via gpt_unichr().
$token = utf8_encode($token);
if(function_exists('mb_strlen'))
{
$len = mb_strlen($token, 'UTF-8');
for ($i = 0; $i < $len; $i++)
{
$chars[] = mb_substr($token, $i, 1, 'UTF-8');
}
}
else
{
// Fallback without mbstring: byte-wise split.
$chars = str_split($token);
}
// Translate each byte value to its printable byte-encoder symbol.
$result_word = '';
foreach($chars as $char)
{
if(isset($byte_encoder[gpt_unichr($char)]))
{
$result_word .= $byte_encoder[gpt_unichr($char)];
}
}
// Run BPE merges, then map each space-separated piece to its token id.
$new_tokens_bpe = gpt_bpe($result_word, $bpe_ranks, $cache);
$new_tokens_bpe = explode(' ', $new_tokens_bpe);
foreach($new_tokens_bpe as $x)
{
if(isset($encoder[$x]))
{
$new_tokens[$x] = $encoder[$x];
}
else
{
// Unknown piece: fall back to the raw piece string itself.
$new_tokens[$x] = $x;
}
}
// Merge this token's ids into the result, appending on key collisions so
// earlier entries are never overwritten.
foreach($new_tokens as $ninx => $nval)
{
if(isset($bpe_tokens[$ninx]))
{
$bpe_tokens[] = $nval;
}
else
{
$bpe_tokens[$ninx] = $nval;
}
}
}
return $bpe_tokens;
}

// array_filter() callback: keep everything except NULL, FALSE and the empty
// string. Strict checks, so 0 and '0' (falsy in PHP) are deliberately kept.
function gpt_my_filter($var)
{
    return !in_array($var, array(null, false, ''), true);
}

// Decode the Unicode code point of the first UTF-8 character in $c, from the
// lead byte and however many continuation bytes it announces (legacy 5- and
// 6-byte forms included). Invalid lead bytes (0xFE/0xFF) yield 0.
function gpt_unichr($c)
{
    $lead = ord($c[0]);
    if ($lead <= 127) {
        // Single-byte ASCII: the byte is the code point.
        return $lead;
    }
    if ($lead >= 192 && $lead <= 223) {
        // 110xxxxx: two-byte sequence.
        return ($lead - 192) * 64 + (ord($c[1]) - 128);
    }
    if ($lead >= 224 && $lead <= 239) {
        // 1110xxxx: three-byte sequence.
        return ($lead - 224) * 4096 + (ord($c[1]) - 128) * 64 + (ord($c[2]) - 128);
    }
    if ($lead >= 240 && $lead <= 247) {
        // 11110xxx: four-byte sequence.
        return ($lead - 240) * 262144 + (ord($c[1]) - 128) * 4096
            + (ord($c[2]) - 128) * 64 + (ord($c[3]) - 128);
    }
    if ($lead >= 248 && $lead <= 251) {
        // Legacy five-byte sequence.
        return ($lead - 248) * 16777216 + (ord($c[1]) - 128) * 262144
            + (ord($c[2]) - 128) * 4096 + (ord($c[3]) - 128) * 64
            + (ord($c[4]) - 128);
    }
    if ($lead >= 252 && $lead <= 253) {
        // Legacy six-byte sequence.
        return ($lead - 252) * 1073741824 + (ord($c[1]) - 128) * 16777216
            + (ord($c[2]) - 128) * 262144 + (ord($c[3]) - 128) * 4096
            + (ord($c[4]) - 128) * 64 + (ord($c[5]) - 128);
    }
    // 0xFE/0xFF: not a valid UTF-8 lead byte.
    return 0;
}
// Build a rank dictionary from a list of 2-element pairs: "first,second" keys
// mapped to sequential integers in list order. Pairs missing either element
// are skipped without consuming a rank. ($y is unused; kept for signature
// compatibility with existing callers.)
function gpt_dictZip($x, $y)
{
    $ranks = array();
    $next_rank = 0;
    foreach ($x as $pair) {
        if (isset($pair[0], $pair[1])) {
            $ranks[$pair[0] . ',' . $pair[1]] = $next_rank++;
        }
    }
    return $ranks;
}
// List every adjacent (left, right) symbol pair in $word, in order.
// A single-symbol word yields an empty list.
function gpt_get_pairs($word)
{
    $pairs = array();
    $left = $word[0];
    $limit = count($word);
    for ($pos = 1; $pos < $limit; $pos++) {
        $right = $word[$pos];
        $pairs[] = array($left, $right);
        $left = $right;
    }
    return $pairs;
}
// Split $str into chunks of $len characters (default: one character each),
// UTF-8-aware when the mbstring extension is loaded, byte-wise otherwise.
function gpt_split($str, $len = 1)
{
    $total = function_exists('mb_strlen')
        ? mb_strlen($str, 'UTF-8')
        : strlen($str);

    $pieces = [];
    for ($offset = 0; $offset < $total; $offset += $len) {
        $pieces[] = function_exists('mb_substr')
            ? mb_substr($str, $offset, $len, 'UTF-8')
            : substr($str, $offset, $len);
    }
    return $pieces;
}
// Apply byte-pair-encoding merges to one pre-tokenized word.
// $token is a string of byte-encoder symbols; $bpe_ranks maps "left,right"
// keys to merge priority (lower merges first); $cache memoises results across
// calls (passed by reference). Returns the final pieces joined by spaces.
function gpt_bpe($token, $bpe_ranks, &$cache)
{
if(array_key_exists($token, $cache))
{
return $cache[$token];
}
$word = gpt_split($token);
$init_len = count($word);
$pairs = gpt_get_pairs($word);
if(!$pairs)
{
// Single-symbol token: nothing to merge. (Note: this path is not cached.)
return $token;
}
while (true)
{
// Pick the adjacent pair with the lowest merge rank. Unranked pairs get
// the sentinel 10e10 (a float key, cast by PHP) so they sort last.
$minPairs = array();
foreach($pairs as $pair)
{
if(array_key_exists($pair[0] . ','. $pair[1], $bpe_ranks))
{
$rank = $bpe_ranks[$pair[0] . ','. $pair[1]];
$minPairs[$rank] = $pair;
}
else
{
$minPairs[10e10] = $pair;
}
}
ksort($minPairs);
$min_key = array_key_first($minPairs);
// NOTE(review): after ksort() the first key is already the minimum, so
// this scan can never change $min_key; kept as-is (harmless).
foreach($minPairs as $mpi => $mp)
{
if($mpi < $min_key)
{
$min_key = $mpi;
}
}
$bigram = $minPairs[$min_key];
if(!array_key_exists($bigram[0] . ',' . $bigram[1], $bpe_ranks))
{
// Best remaining pair is unmergeable: BPE is finished.
break;
}
$first = $bigram[0];
$second = $bigram[1];
// Rebuild the word, replacing each ($first, $second) occurrence with the
// single merged symbol $first.$second.
$new_word = array();
$i = 0;
while ($i < count($word))
{
$j = gpt_indexOf($word, $first, $i);
if ($j === -1)
{
// No further occurrence of $first: copy the remaining tail verbatim.
$new_word = array_merge($new_word, array_slice($word, $i, null, true));
break;
}
// Copy the symbols between the cursor and the next occurrence of $first
// (nothing to copy when the occurrence is at/behind the cursor).
if($i > $j)
{
$slicer = array();
}
elseif($j == 0)
{
$slicer = array();
}
else
{
$slicer = array_slice($word, $i, $j - $i, true);
}
$new_word = array_merge($new_word, $slicer);
// Safety valve: merging can only shrink the word, so growth past the
// original length signals something went wrong — abort the rebuild.
if(count($new_word) > $init_len)
{
break;
}
$i = $j;
if ($word[$i] === $first && $i < count($word) - 1 && $word[$i + 1] === $second)
{
array_push($new_word, $first . $second);
$i = $i + 2;
}
else
{
array_push($new_word, $word[$i]);
$i = $i + 1;
}
}
if($word == $new_word)
{
// No change this round: stop to avoid looping forever.
break;
}
$word = $new_word;
if (count($word) === 1)
{
break;
}
else
{
$pairs = gpt_get_pairs($word);
}
}
// Cache and return the space-joined pieces.
$word = implode(' ', $word);
$cache[$token] = $word;
return $word;
}
// JS-style Array.prototype.indexOf: return the first key of $arrax whose value
// equals $searchElement at or after position $fromIndex, or -1 when absent.
// Fixes in this revision:
//  - removed the dead `$index++` statements (foreach reassigns $index on every
//    iteration, so the manual increments had no effect);
//  - comparison is now strict (===), matching JS indexOf semantics, so
//    numeric-looking string tokens (e.g. '1e2' vs '100') are never conflated
//    the way loose == would conflate them.
function gpt_indexOf($arrax, $searchElement, $fromIndex)
{
    foreach ($arrax as $index => $value) {
        if ($index < $fromIndex) {
            continue;
        }
        if ($value === $searchElement) {
            return $index;
        }
    }
    return -1;
}
use CodeRevolutionPlugins\GPT3Encoder\Encoder;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This whole file would serve better as an example in a README


$prompt = "Many words map to one token, but some don't: indivisible. Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 Sequences of characters commonly found next to each other may be grouped together: 1234567890";
$token_array = gpt_encode($prompt);
$token_array = Encoder::instance()->encode($prompt);
error_log('Token array: ' . print_r($token_array, true));
error_log('Count: ' . count($token_array));

Expand Down
Loading