diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py
index d27d1554..d400de38 100644
--- a/tests/functional/test_chunkers.py
+++ b/tests/functional/test_chunkers.py
@@ -2,34 +2,23 @@
from pathlib import Path
import os
-# Third Party
-from openai import OpenAI
-
# First Party
from instructlab.sdg.utils.chunkers import DocumentChunker
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata")
-openai_api_key = os.environ.get("OPENAI_API_KEY", "EMPTY")
-openai_api_base = os.environ.get("OPENAI_API_BASE", "http://localhost:8000/v1")
-
# TODO: Apparently we don't really need any contents in the qna.yaml?
-knowledge_pdf_qna = """
+knowledge_qna = """
version: 3
domain: astronomy
"""
def test_chunk_pdf(tmp_path):
- client = OpenAI(
- api_key=openai_api_key,
- base_url=openai_api_base,
- )
-
qna_dir = os.path.join(tmp_path, "knowledge")
os.makedirs(qna_dir)
with open(os.path.join(qna_dir, "qna.yaml"), "w", encoding="utf-8") as f:
- f.write(knowledge_pdf_qna)
+ f.write(knowledge_qna)
leaf_node = [
{
@@ -47,5 +36,37 @@ def test_chunk_pdf(tmp_path):
tokenizer_model_name="instructlab/merlinite-7b-lab",
)
chunks = chunker.chunk_documents()
- assert len(chunks) > 1
+ assert len(chunks) > 9
assert "Phoenix is a minor constellation" in chunks[0]
+ for chunk in chunks:
+ # inexact sanity-checking of chunk max length
+ assert len(chunk) < 2500
+
+
+def test_chunk_md(tmp_path):  # chunk the phoenix.md fixture with DocumentChunker and sanity-check the output
+ qna_dir = os.path.join(tmp_path, "knowledge")
+ os.makedirs(qna_dir)
+ with open(os.path.join(qna_dir, "qna.yaml"), "w", encoding="utf-8") as f:  # minimal qna.yaml (only version + domain)
+ f.write(knowledge_qna)
+
+ markdown_path = Path(os.path.join(TEST_DATA_DIR, "phoenix.md"))
+ leaf_node = [
+ {
+ "documents": [markdown_path.read_text(encoding="utf-8")],  # full markdown text, read eagerly
+ "filepaths": [markdown_path],
+ "taxonomy_path": "knowledge",  # same directory name as qna_dir created under tmp_path
+ }
+ ]
+ chunker = DocumentChunker(
+ leaf_node=leaf_node,
+ taxonomy_path=tmp_path,
+ output_dir=tmp_path,
+ server_ctx_size=4096,
+ chunk_word_count=500,
+ tokenizer_model_name="instructlab/merlinite-7b-lab",
+ )
+ chunks = chunker.chunk_documents()
+ assert len(chunks) > 7  # lower bound only; the exact chunk count is deliberately not pinned
+ for chunk in chunks:
+ # inexact sanity check of max chunk length: len() here counts characters (assuming each chunk is a str -- TODO confirm), not words or tokens
+ assert len(chunk) < 2500
diff --git a/tests/functional/testdata/phoenix.md b/tests/functional/testdata/phoenix.md
new file mode 100644
index 00000000..8ed90aea
--- /dev/null
+++ b/tests/functional/testdata/phoenix.md
@@ -0,0 +1,284 @@
+# Phoenix (constellation)
+
+**Phoenix** is a minor [constellation](constellation "wikilink") in the
+[southern sky](southern_sky "wikilink"). Named after the mythical
+[phoenix](Phoenix_(mythology) "wikilink"), it was first depicted on a
+celestial atlas by [Johann Bayer](Johann_Bayer "wikilink") in his 1603
+*[Uranometria](Uranometria "wikilink")*. The French explorer and
+astronomer [Nicolas Louis de
+Lacaille](Nicolas_Louis_de_Lacaille "wikilink") charted the brighter
+stars and gave their [Bayer designations](Bayer_designation "wikilink")
+in 1756. The constellation stretches from roughly −39 degrees to −57 degrees
+[declination](declination "wikilink"), and from 23.5h to 2.5h of [right
+ascension](right_ascension "wikilink"). The constellations Phoenix,
+[Grus](Grus_(constellation) "wikilink"),
+[Pavo](Pavo_(constellation) "wikilink") and [Tucana](Tucana "wikilink"),
+are known as the Southern Birds.
+
+The brightest star, [Alpha Phoenicis](Alpha_Phoenicis "wikilink"), is
+named Ankaa, an [Arabic](Arabic "wikilink") word meaning 'the Phoenix'.
+It is an orange giant of apparent magnitude 2.4. Next is [Beta
+Phoenicis](Beta_Phoenicis "wikilink"), actually a
+[binary](Binary_star "wikilink") system composed of two yellow giants
+with a combined apparent magnitude of 3.3. [Nu
+Phoenicis](Nu_Phoenicis "wikilink") has a dust disk, while the
+constellation has ten star systems with known planets and the recently
+discovered [galaxy clusters](galaxy_cluster "wikilink") [El
+Gordo](El_Gordo_(galaxy_cluster) "wikilink") and the [Phoenix
+Cluster](Phoenix_Cluster "wikilink")—located 7.2 and 5.7 billion light
+years away respectively, two of the largest objects in the [visible
+universe](visible_universe "wikilink"). Phoenix is the
+[radiant](radiant_(meteor_shower) "wikilink") of two annual [meteor
+showers](meteor_shower "wikilink"): the
+[Phoenicids](Phoenicids "wikilink") in December, and the July
+Phoenicids.
+
+## History
+
+Phoenix was the largest of the 12 constellations established by [Petrus
+Plancius](Petrus_Plancius "wikilink") from the observations of [Pieter
+Dirkszoon Keyser](Pieter_Dirkszoon_Keyser "wikilink") and [Frederick de
+Houtman](Frederick_de_Houtman "wikilink"). It first appeared on a 35 cm
+diameter celestial globe published in 1597 (or 1598) in Amsterdam by
+Plancius with [Jodocus Hondius](Jodocus_Hondius "wikilink"). The first
+depiction of this constellation in a celestial atlas was in [Johann
+Bayer](Johann_Bayer "wikilink")'s
+*[Uranometria](Uranometria "wikilink")* of 1603. De Houtman included
+it in his southern star catalog the same year under the Dutch name *Den
+voghel Fenicx*, "The Bird Phoenix", symbolising the
+[phoenix](Phoenix_(mythology) "wikilink") of classical mythology. One
+name of the brightest star [Alpha
+Phoenicis](Alpha_Phoenicis "wikilink")—Ankaa—is derived from the Arabic:
+العنقاء, romanized: al-‘anqā’, lit. 'the phoenix', and
+was coined sometime after 1800 in relation to the constellation.
+
+Celestial historian Richard Allen noted that unlike the other
+constellations introduced by Plancius and [La
+Caille](La_Caille "wikilink"), Phoenix has actual precedent in ancient
+astronomy, as the Arabs saw this formation as representing young
+ostriches, *Al Ri'āl*, or as a griffin or eagle. In addition, the
+same group of stars was sometimes imagined by the Arabs as a boat, *Al
+Zaurak*, on the nearby river Eridanus. He observed, "the introduction
+of a Phoenix into modern astronomy was, in a measure, by adoption rather
+than by invention."
+
+The Chinese incorporated Phoenix's brightest star, Ankaa (Alpha
+Phoenicis), and stars from the adjacent constellation
+[Sculptor](Sculptor_(constellation) "wikilink") to depict *Bakui*, a net
+for catching birds. Phoenix and the neighbouring constellation of
+[Grus](Grus_(constellation) "wikilink") together were seen by [Julius
+Schiller](Julius_Schiller "wikilink") as portraying
+[Aaron](Aaron "wikilink") the High Priest. These two constellations,
+along with nearby [Pavo](Pavo_(constellation) "wikilink") and
+[Tucana](Tucana "wikilink"), are called the Southern Birds.
+
+## Characteristics
+
+Phoenix is a small constellation bordered by [Fornax](Fornax "wikilink")
+and Sculptor to the north, Grus to the west, Tucana to the south,
+touching on the corner of [Hydrus](Hydrus "wikilink") to the south, and
+[Eridanus](Eridanus_(constellation) "wikilink") to the east and
+southeast. The bright star [Achernar](Achernar "wikilink") is
+nearby. The three-letter abbreviation for the constellation, as
+adopted by the [International Astronomical
+Union](International_Astronomical_Union "wikilink") in 1922, is
+"Phe". The official constellation boundaries, as set by Belgian
+astronomer [Eugène Delporte](Eugène_Joseph_Delporte "wikilink") in 1930,
+are defined by a polygon of 10 segments. In the [equatorial coordinate
+system](equatorial_coordinate_system "wikilink"), the [right
+ascension](right_ascension "wikilink") coordinates of these borders lie
+between 23h 26.5m and 02h 25.0m,
+while the [declination](declination "wikilink")
+coordinates are between −39.31° and −57.84°. This means it remains
+below the horizon to anyone living north of the [40th
+parallel](40th_parallel_north "wikilink") in the [Northern
+Hemisphere](Northern_Hemisphere "wikilink"), and remains low in the sky
+for anyone living north of the [equator](equator "wikilink"). It is most
+visible from locations such as Australia and South Africa during late
+[Southern Hemisphere](Southern_Hemisphere "wikilink") spring. Most
+of the constellation lies within, and can be located by, forming a
+triangle of the bright stars Achernar, [Fomalhaut](Fomalhaut "wikilink")
+and [Beta Ceti](Beta_Ceti "wikilink")—Ankaa lies roughly in the centre
+of this.
+
+## Features
+
+### Stars
+
+A curved line of stars comprising Alpha,
+[Kappa](Kappa_Phoenicis "wikilink"), [Mu](Mu_Phoenicis "wikilink"),
+[Beta](Beta_Phoenicis "wikilink"), [Nu](Nu_Phoenicis "wikilink") and
+[Gamma Phoenicis](Gamma_Phoenicis "wikilink") was seen as a boat by the
+ancient Arabs. French explorer and astronomer [Nicolas Louis de
+Lacaille](Nicolas_Louis_de_Lacaille "wikilink") charted and designated
+27 stars with the [Bayer designations](Bayer_designation "wikilink")
+Alpha through to Omega in 1756. Of these, he labelled two stars close
+together Lambda, and assigned Omicron, Psi and Omega to three stars,
+which subsequent astronomers such as [Benjamin
+Gould](Benjamin_Apthorp_Gould "wikilink") felt were too dim to warrant
+their letters. A different star was subsequently labelled Psi Phoenicis,
+while the other two designations fell out of use.
+
+Ankaa is the brightest star in the constellation. It is an orange giant
+of [apparent visual magnitude](apparent_visual_magnitude "wikilink")
+2.37 and [spectral type](Stellar_classification "wikilink")
+K0.5IIIb, 77 light years distant from Earth and orbited by a
+secondary object about which little is known. Lying close by Ankaa
+is [Kappa Phoenicis](Kappa_Phoenicis "wikilink"), a [main
+sequence](main_sequence "wikilink") star of spectral type A5IVn and
+apparent magnitude 3.90. Located centrally in the asterism,
+[Beta Phoenicis](Beta_Phoenicis "wikilink") is the second brightest star
+in the constellation and another [binary star](binary_star "wikilink").
+Together the stars, both yellow giants of spectral type G8, shine with
+an apparent magnitude of 3.31, though the components are of individual
+apparent magnitudes of 4.0 and 4.1 and orbit each other every 168
+years. [Zeta Phoenicis](Zeta_Phoenicis "wikilink") or *Wurren*
+is an [Algol](Algol_variable "wikilink")-type [eclipsing
+binary](Binary_star#Eclipsing_binaries "wikilink"), with an [apparent
+magnitude](apparent_magnitude "wikilink") fluctuating between 3.9 and
+4.4 with a period of around 1.7 days (40 hours); its dimming results
+from its two component blue-white B-type stars, which orbit and block
+each other out as seen from Earth. The two stars are 0.05 AU from each other,
+while a third star is around 600 AU away from the pair, and has an
+orbital period exceeding 5000 years. The system is around 300 light
+years distant. In 1976, researchers Clausen, Gyldenkerne, and
+Grønbech calculated that a nearby 8th magnitude star is a fourth member
+of the system.
+
+AI Phe is an eclipsing binary star identified in 1972. Its long mutual
+eclipses and combination of spectroscopic and astrometric data allow
+precise measurement of the masses and radii of the stars, which is
+viewed as a potential cross-check on stellar properties and distances
+independent of Cepheid variables and similar techniques. The long eclipse
+events require space-based observations to avoid Solar interference.
+Gamma Phoenicis is a [red giant](red_giant "wikilink") of spectral type
+M0IIIa and varies between magnitudes 3.39 and 3.49. It lies 235
+light years away. [Psi Phoenicis](Psi_Phoenicis "wikilink") is
+another red giant, this time of spectral type M4III, and has an
+apparent magnitude that ranges between 4.3 and 4.5 over a period of
+around 30 days. Lying 340 light years away, it has around 85
+times the diameter, but only 85% of the mass, of the Sun. [W
+Phoenicis](W_Phoenicis "wikilink") is a [Mira
+variable](Mira_variable "wikilink"), ranging from magnitude 8.1 to 14.4
+over 333.95 days. A red giant, its spectrum ranges between M5e and
+M6e. Located 6.5 degrees west of Ankaa is [SX
+Phoenicis](SX_Phoenicis "wikilink"), a variable star which ranges from
+magnitude 7.1 to 7.5 over a period of a mere 79 minutes. Its spectral
+type varies between A2 and F4. It gives its name to a group of stars
+known as [SX Phoenicis variables](SX_Phoenicis_variable "wikilink").
+[Rho](Rho_Phoenicis "wikilink") and [BD
+Phoenicis](BD_Phoenicis "wikilink") are [Delta Scuti
+variables](Delta_Scuti_variable "wikilink")—short period (six hours at
+most) pulsating stars that have been used as [standard
+candles](Cosmic_distance_ladder#Standard_candles "wikilink") and as
+subjects to study [asteroseismology](astroseismology "wikilink"). Rho
+is spectral type F2III, and ranges between magnitudes 5.20 and 5.26
+over a period of 2.85 hours. BD is of spectral type A1V, and
+ranges between magnitudes 5.90 and 5.94.
+
+[Nu Phoenicis](Nu_Phoenicis "wikilink") is a yellow-white main sequence
+star of spectral type F9V and magnitude 4.96. Lying some 49 light
+years distant, it is around 1.2 times as massive as the Sun, and
+likely to be surrounded by a disk of dust. It is the closest star in
+the constellation that is visible with the unaided eye. [Gliese
+915](Gliese_915 "wikilink") is a [white dwarf](white_dwarf "wikilink")
+only 26 light years away. It is of magnitude 13.05, too faint to be seen
+with the naked eye. White dwarfs are extremely dense stars compacted
+into a volume the size of the Earth. With around 85% of the mass of
+the Sun, Gliese 915 has a [surface gravity](surface_gravity "wikilink")
+of $10^{8.39 \pm 0.01}$ ($2.45 \cdot 10^{8}$)
+[cm](centimetre "wikilink")·[s](second "wikilink")$^{-2}$, or
+approximately 250,000 times [Earth's](Earth's_gravity "wikilink").
+
+Ten stars have been found to have planets to date, and four planetary
+systems have been discovered with the [SuperWASP](SuperWASP "wikilink")
+project. [HD 142](HD_142 "wikilink") is a yellow giant that has an
+apparent magnitude of 5.7, and has a planet ([HD 142
+b](HD_142_b "wikilink")) 1.36 times the mass of Jupiter which orbits
+every 328 days. [HD 2039](HD_2039 "wikilink") is a yellow subgiant
+with an apparent magnitude of 9.0 around 330 light years away which has
+a planet ([HD 2039 b](HD_2039_b "wikilink")) six times the mass of
+Jupiter. [WASP-18](WASP-18 "wikilink") is a star of magnitude 9.29 which
+was discovered to have a hot Jupiter-like planet
+([WASP-18b](WASP-18b "wikilink")) taking less than a day to orbit the
+star. The planet is suspected to be causing WASP-18 to appear older
+than it really is. [WASP-4](WASP-4 "wikilink") and
+[WASP-5](WASP-5 "wikilink") are solar-type yellow stars around 1000
+light years distant and of 13th magnitude, each with a single planet
+larger than Jupiter. [WASP-29](WASP-29 "wikilink") is an orange
+dwarf of spectral type K4V and visual magnitude 11.3, which has a
+planetary companion of similar size and mass to Saturn. The planet
+completes an orbit every 3.9 days.
+
+[WISE J003231.09-494651.4](List_of_brown_dwarfs "wikilink") and [WISE
+J001505.87-461517.6](List_of_brown_dwarfs "wikilink") are two [brown
+dwarfs](brown_dwarf "wikilink") discovered by the [Wide-field Infrared
+Survey Explorer](Wide-field_Infrared_Survey_Explorer "wikilink"), and
+are 63 and 49 light years away respectively. Initially hypothesised
+before they were belatedly discovered, brown dwarfs are objects more
+massive than planets, but which are of insufficient mass for [hydrogen
+fusion](Nuclear_fusion "wikilink") characteristic of stars to occur.
+Many are being found by sky surveys.
+
+Phoenix contains [HE0107-5240](HE0107-5240 "wikilink"), possibly one of
+the oldest stars yet discovered. It has around 1/200,000 the
+[metallicity](metallicity "wikilink") that the Sun has and hence must
+have formed very early in the history of the universe. With a visual
+magnitude of 15.17, it is around 10,000 times dimmer than the
+faintest stars visible to the naked eye and is 36,000 light years
+distant.
+
+### Deep-sky objects
+
+The constellation does not lie on the [galactic
+plane](galactic_plane "wikilink") of the Milky Way, and there are no
+prominent star clusters. [NGC 625](NGC_625 "wikilink") is a dwarf
+[irregular galaxy](irregular_galaxy "wikilink") of apparent magnitude
+11.0 and lying some 12.7 million light years distant. Only 24000 light
+years in diameter, it is an outlying member of the [Sculptor
+Group](Sculptor_Group "wikilink"). NGC 625 is thought to have been
+involved in a collision and is experiencing a burst of [active star
+formation](Active_galactic_nucleus "wikilink"). [NGC
+37](NGC_37 "wikilink") is a [lenticular
+galaxy](lenticular_galaxy "wikilink") of apparent magnitude 14.66. It is
+approximately 42 [kiloparsecs](kiloparsecs "wikilink") (137,000
+[light-years](light-years "wikilink")) in diameter and about 12.9
+billion years old. [Robert's Quartet](Robert's_Quartet "wikilink")
+(composed of the irregular galaxy [NGC 87](NGC_87 "wikilink"), and three
+spiral galaxies [NGC 88](NGC_88 "wikilink"), [NGC 89](NGC_89 "wikilink")
+and [NGC 92](NGC_92 "wikilink")) is a group of four galaxies located
+around 160 million light-years away which are in the process of
+colliding and merging. They are within a circle of radius of 1.6 arcmin,
+corresponding to about 75,000 light-years. Located in the galaxy ESO
+243-49 is [HLX-1](HLX-1 "wikilink"), an [intermediate-mass black
+hole](intermediate-mass_black_hole "wikilink")—the first one of its kind
+identified. It is thought to be a remnant of a dwarf galaxy that was
+absorbed in a [collision](Interacting_galaxy "wikilink") with ESO
+243-49. Before its discovery, this class of black hole was only
+hypothesized.
+
+Lying within the bounds of the constellation is the gigantic [Phoenix
+cluster](Phoenix_cluster "wikilink"), which is around 7.3 million light
+years wide and 5.7 billion light years away, making it one of the most
+massive [galaxy clusters](galaxy_cluster "wikilink"). It was first
+discovered in 2010, and the central galaxy is producing an estimated 740
+new stars a year. Larger still is [El
+Gordo](El_Gordo_(galaxy_cluster) "wikilink"), or officially ACT-CL
+J0102-4915, whose discovery was announced in 2012. Located around
+7.2 billion light years away, it is composed of two subclusters in the
+process of colliding, resulting in the spewing out of hot gas, seen in
+X-rays and infrared images.
+
+### Meteor showers
+
+Phoenix is the [radiant](radiant_(meteor_shower) "wikilink") of two
+annual [meteor showers](meteor_shower "wikilink"). The
+[Phoenicids](Phoenicids "wikilink"), also known as the December
+Phoenicids, were first observed on 3 December 1887. The shower was
+particularly intense in December 1956, and is thought to be related to the
+breakup of the [short-period comet](short-period_comet "wikilink")
+[289P/Blanpain](289P/Blanpain "wikilink"). It peaks around 4–5 December,
+though it is not seen every year. A very minor meteor shower peaks
+around July 14 with around one meteor an hour, though meteors can be
+seen anytime from July 3 to 18; this shower is referred to as the July
+Phoenicids.