-
-
Notifications
You must be signed in to change notification settings - Fork 67
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8e0a3bd
commit 8c90e2a
Showing
1 changed file
with
55 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
<!DOCTYPE html><html><head><title>documentation: Projection</title><meta charset="utf-8" /><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" /><meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="author" content="Marcin Jakubowski" /><meta name="description" content="Read and write Parquet files using Scala" /><meta name="og:image" content="/parquet4s/img/poster.png" /><meta name="image" property="og:image" content="/parquet4s/img/poster.png" /><meta name="og:title" content="documentation: Projection" /><meta name="title" property="og:title" content="documentation: Projection" /><meta name="og:site_name" content="documentation" /><meta name="og:url" content="" /><meta name="og:type" content="website" /><meta name="og:description" content="Read and write Parquet files using Scala" /><link rel="icon" type="image/png" href="/parquet4s/img/favicon.png" /><meta name="twitter:title" content="documentation: Projection" /><meta name="twitter:image" content="/parquet4s/img/poster.png" /><meta name="twitter:description" content="Read and write Parquet files using Scala" /><meta name="twitter:card" content="summary_large_image" /><link rel="icon" type="image/png" sizes="16x16" href="/parquet4s/img/favicon-16x16.png" /><link rel="icon" type="image/png" sizes="32x32" href="/parquet4s/img/favicon-32x32.png" /><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css" /><link rel="stylesheet" href="/parquet4s/highlight/styles/vs.css" /><link rel="stylesheet" href="/parquet4s/css/light-style.css" /></head><body class="docs"><div id="wrapper"><div id="sidebar-wrapper"><div id="sidebar-brand"><a href="/parquet4s/" class="brand"><div class="brand-wrapper"></div><span>documentation</span></a><button id="main-toggle" class="sidebar-toggle"><span class="close"></span></button></div><div class="sidebar-nav"> <div class="sidebar-nav-item "><a href="/parquet4s/docs" title="Introduction" 
class="">Introduction</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/quick_start" title="Quick Start" class="">Quick Start</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/akka" title="Integration with Akka Streams" class="">Integration with Akka Streams</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/pekko" title="Integration with Pekko Streams" class="">Integration with Pekko Streams</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/fs2" title="Integration with FS2" class="">Integration with FS2</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/storage_types" title="Supported storage types" class="">Supported storage types</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/records_and_schema" title="Records, types and schema" class="">Records, types and schema</a></div> <div class="sidebar-nav-item active "><a href="/parquet4s/docs/projection" title="Projection" class="active">Projection</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/filtering" title="Filtering" class="">Filtering</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/partitioning" title="Partitioning" class="">Partitioning</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/statistics" title="Statistics" class="">Statistics</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/examples" title="Examples" class="">Examples</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/migration" title="Migration from 1.x" class="">Migration from 1.x</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/etl" title="(Experimental) ETL" class="">(Experimental) ETL</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/protobuf" title="(Experimental) Protobuf with ScalaPB" class="">(Experimental) Protobuf with ScalaPB</a></div> <div class="sidebar-nav-item "><a href="/parquet4s/docs/sponsors" title="Distinguished 
Sponsors" class="">Distinguished Sponsors</a></div></div></div><div id="page-content-wrapper"><div class="nav"><div class="container-fluid"><div class="row"><div class="col-lg-12"><div class="action-menu pull-left clearfix"><a href="#menu-toggle" id="menu-toggle"><i class="fa fa-bars" aria-hidden="true"></i></a></div><ul class="pull-right"><li class="search-nav"><div id="search-dropdown"><label><i class="fa fa-search"></i>Search</label><input id="search-bar" type="text" placeholder="Enter keywords here..." onclick="displayToggleSearch(event)" /><ul id="search-dropdown-content" class="dropdown dropdown-content"></ul></div></li><li id="gh-eyes-item" class="hidden-xs to-uppercase"><a href="https://github.com/mjakubowski84/parquet4s" target="_blank" rel="noopener noreferrer"><i class="fa fa-eye"></i><span>Watchers<span id="eyes" class="label label-default">--</span></span></a></li><li id="gh-stars-item" class="hidden-xs to-uppercase"><a href="https://github.com/mjakubowski84/parquet4s" target="_blank" rel="noopener noreferrer"><i class="fa fa-star-o"></i><span>Stars<span id="stars" class="label label-default">--</span></span></a></li></ul></div></div></div></div><div id="content" data-github-owner="mjakubowski84" data-github-repo="parquet4s"><div class="content-wrapper"><section><h1 id="projection">Projection</h1> | ||
|
||
<p>Schema projection is a way to optimize reads. When calling <code class="language-plaintext highlighter-rouge">ParquetReader.as[MyData]</code>, Parquet4s reads the whole content of each Parquet record, even when you provide a case class that maps only a subset of the stored columns. The same happens when you use generic records by calling <code class="language-plaintext highlighter-rouge">ParquetReader.generic</code>. However, you can explicitly tell Parquet4s to use a different schema. As a result, all columns not matching your schema will be skipped and not read. You can define the projection schema in several ways:</p> | ||
|
||
<ol> | ||
<li>by defining a case class for typed read using <code class="language-plaintext highlighter-rouge">projectedAs</code>,</li> | ||
<li>by defining generic column projection (allows reference to nested fields and aliases) using <code class="language-plaintext highlighter-rouge">projectedGeneric</code>,</li> | ||
<li>by providing your own instance of Parquet’s <code class="language-plaintext highlighter-rouge">MessageType</code> for generic read using <code class="language-plaintext highlighter-rouge">projectedGeneric</code>.</li> | ||
</ol> | ||
|
||
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">import</span> <span class="nn">com.github.mjakubowski84.parquet4s.</span><span class="o">{</span><span class="nc">Col</span><span class="o">,</span> <span class="nc">ParquetIterable</span><span class="o">,</span> <span class="nc">ParquetReader</span><span class="o">,</span> <span class="nc">Path</span><span class="o">,</span> <span class="nc">RowParquetRecord</span><span class="o">}</span> | ||
<span class="k">import</span> <span class="nn">org.apache.parquet.schema.MessageType</span> | ||
|
||
<span class="c1">// typed read</span> | ||
<span class="k">case</span> <span class="k">class</span> <span class="nc">MyData</span><span class="o">(</span><span class="n">column1</span><span class="k">:</span> <span class="kt">Int</span><span class="o">,</span> <span class="n">columnX</span><span class="k">:</span> <span class="kt">String</span><span class="o">)</span> | ||
<span class="k">val</span> <span class="nv">myData</span><span class="k">:</span> <span class="kt">ParquetIterable</span><span class="o">[</span><span class="kt">MyData</span><span class="o">]</span> <span class="k">=</span> | ||
<span class="nc">ParquetReader</span> | ||
<span class="o">.</span><span class="py">projectedAs</span><span class="o">[</span><span class="kt">MyData</span><span class="o">]</span> | ||
<span class="o">.</span><span class="py">read</span><span class="o">(</span><span class="nc">Path</span><span class="o">(</span><span class="s">"file.parquet"</span><span class="o">))</span> | ||
|
||
<span class="c1">// generic read with column projection</span> | ||
<span class="k">val</span> <span class="nv">records1</span><span class="k">:</span> <span class="kt">ParquetIterable</span><span class="o">[</span><span class="kt">RowParquetRecord</span><span class="o">]</span> <span class="k">=</span> | ||
<span class="nc">ParquetReader</span> | ||
<span class="o">.</span><span class="py">projectedGeneric</span><span class="o">(</span> | ||
<span class="nc">Col</span><span class="o">(</span><span class="s">"column1"</span><span class="o">).</span><span class="py">as</span><span class="o">[</span><span class="kt">Int</span><span class="o">],</span> | ||
<span class="nc">Col</span><span class="o">(</span><span class="s">"columnX"</span><span class="o">).</span><span class="py">as</span><span class="o">[</span><span class="kt">String</span><span class="o">].</span><span class="py">alias</span><span class="o">(</span><span class="s">"my_column"</span><span class="o">),</span> | ||
<span class="o">)</span> | ||
<span class="o">.</span><span class="py">read</span><span class="o">(</span><span class="nc">Path</span><span class="o">(</span><span class="s">"file.parquet"</span><span class="o">))</span> | ||
|
||
<span class="c1">// generic read with own instance of Parquet schema</span> | ||
<span class="k">val</span> <span class="nv">schemaOverride</span><span class="k">:</span> <span class="kt">MessageType</span> <span class="o">=</span> <span class="o">???</span> | ||
<span class="k">val</span> <span class="nv">records2</span><span class="k">:</span> <span class="kt">ParquetIterable</span><span class="o">[</span><span class="kt">RowParquetRecord</span><span class="o">]</span> <span class="k">=</span> | ||
<span class="nc">ParquetReader</span> | ||
<span class="o">.</span><span class="py">projectedGeneric</span><span class="o">(</span><span class="n">schemaOverride</span><span class="o">)</span> | ||
<span class="o">.</span><span class="py">read</span><span class="o">(</span><span class="nc">Path</span><span class="o">(</span><span class="s">"file.parquet"</span><span class="o">))</span> | ||
</code></pre></div></div> | ||
|
||
</section></div></div></div></div><script src="/parquet4s/highlight/highlight.pack.js"></script><script src="/parquet4s/lunr/lunr.js"></script><script> | ||
// Each highlighted snippet is wrapped in a <div> whose class list carries a
// `language-*` marker. hljs expects the language name on the inner <code>
// element instead, so propagate it from every wrapper to its nested code tags.
const langPrefix = 'language-';
for (const wrapper of document.querySelectorAll(`div[class^='${langPrefix}']`)) {
  for (const className of wrapper.classList) {
    // Only classes carrying the language prefix name a language.
    if (!className.startsWith(langPrefix)) {
      continue;
    }
    // Strip the prefix to obtain the bare language name (e.g. "scala").
    const language = className.slice(langPrefix.length);
    for (const codeElement of wrapper.querySelectorAll('pre code')) {
      codeElement.classList.add(language);
    }
  }
}

// Configure hljs with the languages used by these docs, then register
// highlighting to run on page load.
hljs.configure({languages:['scala','java','bash']});
hljs.initHighlightingOnLoad();
</script><script>console.info('\x57\x65\x62\x73\x69\x74\x65\x20\x62\x75\x69\x6c\x74\x20\x77\x69\x74\x68\x3a\x0a\x20\x20\x20\x20\x20\x20\x20\x20\x20\x5f\x5f\x20\x20\x20\x20\x5f\x5f\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x5f\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x5f\x20\x5f\x5f\x0a\x20\x20\x20\x5f\x5f\x5f\x5f\x5f\x2f\x20\x2f\x5f\x20\x20\x2f\x20\x2f\x5f\x20\x20\x20\x20\x20\x20\x5f\x5f\x5f\x5f\x20\x5f\x5f\x5f\x20\x20\x28\x5f\x29\x5f\x5f\x5f\x5f\x5f\x5f\x5f\x5f\x5f\x5f\x5f\x5f\x5f\x20\x20\x5f\x5f\x5f\x5f\x5f\x28\x5f\x29\x20\x2f\x5f\x5f\x5f\x5f\x20\x20\x5f\x5f\x5f\x5f\x5f\x0a\x20\x20\x2f\x20\x5f\x5f\x5f\x2f\x20\x5f\x5f\x20\x5c\x2f\x20\x5f\x5f\x2f\x5f\x5f\x5f\x5f\x5f\x2f\x20\x5f\x5f\x20\x60\x5f\x5f\x20\x5c\x2f\x20\x2f\x20\x5f\x5f\x5f\x2f\x20\x5f\x5f\x5f\x2f\x20\x5f\x5f\x20\x5c\x2f\x20\x5f\x5f\x5f\x2f\x20\x2f\x20\x5f\x5f\x2f\x20\x5f\x20\x5c\x2f\x20\x5f\x5f\x5f\x2f\x0a\x20\x28\x5f\x5f\x20\x20\x29\x20\x2f\x5f\x2f\x20\x2f\x20\x2f\x5f\x2f\x5f\x5f\x5f\x5f\x5f\x2f\x20\x2f\x20\x2f\x20\x2f\x20\x2f\x20\x2f\x20\x2f\x20\x2f\x5f\x5f\x2f\x20\x2f\x20\x20\x2f\x20\x2f\x5f\x2f\x20\x28\x5f\x5f\x20\x20\x29\x20\x2f\x20\x2f\x5f\x2f\x20\x20\x5f\x5f\x28\x5f\x5f\x20\x20\x29\x0a\x2f\x5f\x5f\x5f\x5f\x2f\x5f\x2e\x5f\x5f\x5f\x2f\x5c\x5f\x5f\x2f\x20\x20\x20\x20\x20\x2f\x5f\x2f\x20\x2f\x5f\x2f\x20\x2f\x5f\x2f\x5f\x2f\x5c\x5f\x5f\x5f\x2f\x5f\x2f\x20\x20\x20\x5c\x5f\x5f\x5f\x5f\x2f\x5f\x5f\x5f\x5f\x2f\x5f\x2f\x5c\x5f\x5f\x2f\x5c\x5f\x5f\x5f\x2f\x5f\x5f\x5f\x5f\x2f\x0a\x0a\x68\x74\x74\x70\x73\x3a\x2f\x2f\x34\x37\x64\x65\x67\x2e\x67\x69\x74\x68\x75\x62\x2e\x69\x6f\x2f\x73\x62\x74\x2d\x6d\x69\x63\x72\x6f\x73\x69\x74\x65\x73')</script><script src="/parquet4s/js/search.js"></script><script src="/parquet4s/js/docs.js"></script></body></html> |