-
-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
eddc49b
commit 7c5ec17
Showing
4 changed files
with
181 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
<span style="float:right;"><a href="https://github.com/RubixML/Extras/blob/master/src/Tokenizers/KMer.php">[source]</a></span> | ||
|
||
# K-mer | ||
K-mers are substrings of sequences such as DNA containing the bases A, T, C, and G with a length of *k*. They are often used in bioinformatics to represent features of a DNA sequence. | ||
|
||
!!! note | ||
K-mers that contain invalid bases will not be generated. | ||
|
||
## Parameters | ||
| # | Name | Default | Type | Description | | ||
|---|---|---|---|---| | ||
| 1 | k | 4 | int | The length of tokenized sequences. | | ||
|
||
## Example | ||
```php | ||
use Rubix\ML\Tokenizers\KMer; | ||
|
||
$tokenizer = new KMer(4); | ||
``` | ||
|
||
## Additional Methods | ||
Return the number of k-mers that were dropped due to invalid bases. | ||
```php | ||
public function dropped() : int | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
<?php | ||
|
||
namespace Rubix\ML\Tokenizers; | ||
|
||
use Rubix\ML\Exceptions\InvalidArgumentException; | ||
|
||
/**
 * K-mer
 *
 * K-mers are substrings of sequences such as DNA containing the bases A, T, C, and G with a length of *k*.
 * They are often used in bioinformatics to represent features of a DNA sequence.
 *
 * !!! note
 *     K-mers that contain invalid bases will not be generated.
 *
 * @category    Machine Learning
 * @package     Rubix/ML
 * @author      Andrew DalPino
 */
class Kmer implements Tokenizer
{
    /**
     * The regular expression matching any single character that is not one of the bases A, C, T, or G.
     * Matching is case-sensitive, so lowercase letters count as invalid.
     *
     * @var string
     */
    protected const INVALID_BASE_REGEX = '/[^ACTG]/';

    /**
     * The length of tokenized sequences.
     *
     * @var int
     */
    protected int $k;

    /**
     * The number of k-mers that were dropped due to invalid bases. The count accumulates
     * over every call to tokenize() made on this instance and is never reset.
     *
     * @var int
     */
    protected int $dropped = 0;

    /**
     * @param int $k The length of each k-mer, must be at least 1.
     * @throws \Rubix\ML\Exceptions\InvalidArgumentException If k is less than 1.
     */
    public function __construct(int $k = 4)
    {
        if ($k < 1) {
            throw new InvalidArgumentException('K must be'
                . " greater than 0, $k given.");
        }

        $this->k = $k;
    }

    /**
     * Return the number of k-mers that were dropped due to invalid bases.
     *
     * @return int
     */
    public function dropped() : int
    {
        return $this->dropped;
    }

    /**
     * Tokenize a blob of text into every overlapping k-mer that consists solely of valid bases.
     *
     * A window of length k is slid over the text one byte at a time. When a window contains an
     * invalid base, every window that overlaps that base is dropped and scanning resumes at the
     * position immediately after it. Texts shorter than k produce no tokens.
     *
     * @internal
     *
     * @param string $text
     * @return list<string>
     */
    public function tokenize(string $text) : array
    {
        // The last index at which a full window of length k still fits (negative if the text is too short).
        $p = strlen($text) - $this->k;

        $tokens = [];

        for ($i = 0; $i <= $p; ++$i) {
            $token = substr($text, $i, $this->k);

            if (preg_match(self::INVALID_BASE_REGEX, $token, $matches, PREG_OFFSET_CAPTURE)) {
                // Offset of the first invalid base relative to the start of the current window.
                $offset = (int) $matches[0][1];

                // The windows starting at $i through $i + $offset all contain the invalid base,
                // but only those starting at or before $p exist, so the count is clamped.
                $this->dropped += 1 + min($offset, $p - $i);

                // Move onto the invalid base itself; the loop's own ++$i then lands on the
                // first position after it, so no valid window is skipped.
                $i += $offset;

                continue;
            }

            $tokens[] = $token;
        }

        return $tokens;
    }

    /**
     * Return the string representation of the object.
     *
     * @return string
     */
    public function __toString() : string
    {
        return "K-mer (k: {$this->k})";
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
<?php | ||
|
||
namespace Rubix\ML\Tests\Tokenizers; | ||
|
||
use Rubix\ML\Tokenizers\Kmer; | ||
use Rubix\ML\Tokenizers\Tokenizer; | ||
use PHPUnit\Framework\TestCase; | ||
|
||
/**
 * Unit tests for the K-mer tokenizer.
 *
 * @group Tokenizers
 * @covers \Rubix\ML\Tokenizers\Kmer
 */
class KmerTest extends TestCase
{
    /**
     * The tokenizer under test, rebuilt before every test with k = 4.
     *
     * @var \Rubix\ML\Tokenizers\Kmer
     */
    protected $tokenizer;

    /**
     * @before
     */
    protected function setUp() : void
    {
        $this->tokenizer = new Kmer(4);
    }

    /**
     * The tokenizer can be constructed and satisfies the Tokenizer contract.
     *
     * @test
     */
    public function build() : void
    {
        $this->assertInstanceOf(Kmer::class, $this->tokenizer);
        $this->assertInstanceOf(Tokenizer::class, $this->tokenizer);
    }

    /**
     * Every 4-mer made up solely of valid bases is emitted, while the 4-mers that
     * overlap the invalid base 'N' are dropped and counted.
     *
     * @test
     */
    public function tokenize() : void
    {
        // 'N' sits at index 14, so the four windows starting at indices 11 to 14 contain it.
        // The window starting at index 15 ('TCGA') has only valid bases and must be kept.
        $text = 'ACGCGTCGAATTCGNTCGA';

        $expected = [
            'ACGC', 'CGCG', 'GCGT', 'CGTC', 'GTCG', 'TCGA', 'CGAA', 'GAAT', 'AATT', 'ATTC', 'TTCG',
            'TCGA',
        ];

        $tokens = $this->tokenizer->tokenize($text);

        $this->assertEquals($expected, $tokens);
        $this->assertCount(12, $tokens);
        $this->assertSame(4, $this->tokenizer->dropped());
    }
}