Skip to content

Commit

Permalink
Implement k-mer tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewdalpino committed Jul 7, 2021
1 parent eddc49b commit 7c5ec17
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
- 1.2.0-beta
- Added Deduplicator extractor
- Added K-mer tokenizer

- 1.1.0-beta
- Added GELU activation function
Expand Down
25 changes: 25 additions & 0 deletions docs/tokenizers/k-mer.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<span style="float:right;"><a href="https://github.com/RubixML/Extras/blob/master/src/Tokenizers/Kmer.php">[source]</a></span>

# K-mer
K-mers are substrings of sequences such as DNA containing the bases A, T, C, and G with a length of *k*. They are often used in bioinformatics to represent features of a DNA sequence.

!!! note
K-mers that contain invalid bases will not be generated.

## Parameters
| # | Name | Default | Type | Description |
|---|---|---|---|---|
| 1 | k | 4 | int | The length of tokenized sequences. |

## Example
```php
use Rubix\ML\Tokenizers\Kmer;

$tokenizer = new Kmer(4);
```

## Additional Methods
Return the number of k-mers that were dropped due to invalid bases.
```php
public function dropped() : int
```
102 changes: 102 additions & 0 deletions src/Tokenizers/Kmer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
<?php

namespace Rubix\ML\Tokenizers;

use Rubix\ML\Exceptions\InvalidArgumentException;

/**
 * K-mer
 *
 * K-mers are substrings of sequences such as DNA containing the bases A, T, C, and G with a length of *k*.
 * They are often used in bioinformatics to represent features of a DNA sequence.
 *
 * !!! note
 *     K-mers that contain invalid bases will not be generated.
 *
 * @category    Machine Learning
 * @package     Rubix/ML
 * @author      Andrew DalPino
 */
class Kmer implements Tokenizer
{
    /**
     * The regular expression that matches any single character that is not a valid base.
     *
     * @var string
     */
    protected const INVALID_BASE_REGEX = '/[^ACTG]/';

    /**
     * The length of tokenized sequences.
     *
     * @var int
     */
    protected int $k;

    /**
     * The number of k-mers that were dropped due to invalid bases. The count
     * accumulates over every call to tokenize() made on this instance.
     *
     * @var int
     */
    protected int $dropped = 0;

    /**
     * @param int $k The length of each k-mer, must be at least 1.
     * @throws \Rubix\ML\Exceptions\InvalidArgumentException If k is less than 1.
     */
    public function __construct(int $k = 4)
    {
        if ($k < 1) {
            throw new InvalidArgumentException('K must be'
                . " greater than 0, $k given.");
        }

        $this->k = $k;
    }

    /**
     * Return the number of k-mers that were dropped due to invalid bases.
     *
     * @return int
     */
    public function dropped() : int
    {
        return $this->dropped;
    }

    /**
     * Tokenize a blob of text.
     *
     * Slides a window of length k over the text one character at a time and emits
     * every window that consists solely of the bases A, C, T, and G. Windows that
     * overlap an invalid character are skipped and added to the dropped count.
     * A text shorter than k yields no tokens.
     *
     * @internal
     *
     * @param string $text
     * @return list<string>
     */
    public function tokenize(string $text) : array
    {
        // Index of the last position at which a full-length window still fits.
        $p = strlen($text) - $this->k;

        $tokens = [];

        for ($i = 0; $i <= $p; ++$i) {
            $token = substr($text, $i, $this->k);

            if (preg_match(self::INVALID_BASE_REGEX, $token, $matches, PREG_OFFSET_CAPTURE)) {
                // Offset of the first invalid character relative to the start of this window.
                $offset = (int) $matches[0][1];

                // Windows $i through $i + $offset all cover the invalid character, but
                // only those starting at or before $p actually exist in the text.
                $this->dropped += min($offset, $p - $i) + 1;

                // Move to the last window covering the invalid character. The loop's own
                // increment then lands on the first window that starts right after it,
                // so no valid k-mer is skipped.
                $i += $offset;

                continue;
            }

            $tokens[] = $token;
        }

        return $tokens;
    }

    /**
     * Return the string representation of the object.
     *
     * @return string
     */
    public function __toString() : string
    {
        return "K-mer (k: {$this->k})";
    }
}
53 changes: 53 additions & 0 deletions tests/Tokenizers/KmerTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<?php

namespace Rubix\ML\Tests\Tokenizers;

use Rubix\ML\Tokenizers\Kmer;
use Rubix\ML\Tokenizers\Tokenizer;
use PHPUnit\Framework\TestCase;

/**
 * Unit tests for the K-mer tokenizer.
 *
 * @group Tokenizers
 * @covers \Rubix\ML\Tokenizers\Kmer
 */
class KmerTest extends TestCase
{
    /**
     * The tokenizer under test, built with k = 4 before every test.
     *
     * @var \Rubix\ML\Tokenizers\Kmer
     */
    protected $tokenizer;

    /**
     * @before
     */
    protected function setUp() : void
    {
        $this->tokenizer = new Kmer(4);
    }

    /**
     * @test
     */
    public function build() : void
    {
        $this->assertInstanceOf(Kmer::class, $this->tokenizer);
        $this->assertInstanceOf(Tokenizer::class, $this->tokenizer);
    }

    /**
     * @test
     */
    public function badK() : void
    {
        $this->expectException(\Rubix\ML\Exceptions\InvalidArgumentException::class);

        new Kmer(0);
    }

    /**
     * @test
     */
    public function tokenize() : void
    {
        $text = 'ACGCGTCGAATTCGNTCGA';

        // All windows that end before the invalid base 'N' at index 14.
        $expected = [
            'ACGC', 'CGCG', 'GCGT', 'CGTC', 'GTCG', 'TCGA', 'CGAA', 'GAAT', 'AATT', 'ATTC', 'TTCG',
        ];

        $tokens = $this->tokenizer->tokenize($text);

        // Every k-mer preceding the invalid base must be emitted, in order.
        $this->assertSame($expected, array_slice($tokens, 0, count($expected)));

        // No emitted token may contain an invalid base or differ in length from k.
        foreach ($tokens as $token) {
            $this->assertSame(1, preg_match('/^[ACTG]{4}$/', $token));
        }

        // Exactly four windows (starting at indices 11 through 14) overlap the 'N'.
        $this->assertSame(4, $this->tokenizer->dropped());
    }

    /**
     * @test
     */
    public function tokenizeSequenceShorterThanK() : void
    {
        $this->assertSame([], $this->tokenizer->tokenize('ACG'));
        $this->assertSame(0, $this->tokenizer->dropped());
    }
}

0 comments on commit 7c5ec17

Please sign in to comment.