diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index d27d1554..d400de38 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -2,34 +2,23 @@ from pathlib import Path import os -# Third Party -from openai import OpenAI - # First Party from instructlab.sdg.utils.chunkers import DocumentChunker TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata") -openai_api_key = os.environ.get("OPENAI_API_KEY", "EMPTY") -openai_api_base = os.environ.get("OPENAI_API_BASE", "http://localhost:8000/v1") - # TODO: Apparently we don't really need any contents in the qna.yaml? -knowledge_pdf_qna = """ +knowledge_qna = """ version: 3 domain: astronomy """ def test_chunk_pdf(tmp_path): - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - qna_dir = os.path.join(tmp_path, "knowledge") os.makedirs(qna_dir) with open(os.path.join(qna_dir, "qna.yaml"), "w", encoding="utf-8") as f: - f.write(knowledge_pdf_qna) + f.write(knowledge_qna) leaf_node = [ { @@ -47,5 +36,37 @@ def test_chunk_pdf(tmp_path): tokenizer_model_name="instructlab/merlinite-7b-lab", ) chunks = chunker.chunk_documents() - assert len(chunks) > 1 + assert len(chunks) > 9 assert "Phoenix is a minor constellation" in chunks[0] + for chunk in chunks: + # inexact sanity-checking of chunk max length + assert len(chunk) < 2500 + + +def test_chunk_md(tmp_path): + qna_dir = os.path.join(tmp_path, "knowledge") + os.makedirs(qna_dir) + with open(os.path.join(qna_dir, "qna.yaml"), "w", encoding="utf-8") as f: + f.write(knowledge_qna) + + markdown_path = Path(os.path.join(TEST_DATA_DIR, "phoenix.md")) + leaf_node = [ + { + "documents": [markdown_path.read_text(encoding="utf-8")], + "filepaths": [markdown_path], + "taxonomy_path": "knowledge", + } + ] + chunker = DocumentChunker( + leaf_node=leaf_node, + taxonomy_path=tmp_path, + output_dir=tmp_path, + server_ctx_size=4096, + chunk_word_count=500, + 
tokenizer_model_name="instructlab/merlinite-7b-lab", + ) + chunks = chunker.chunk_documents() + assert len(chunks) > 7 + for chunk in chunks: + # inexact sanity-checking of chunk max length + assert len(chunk) < 2500 diff --git a/tests/functional/testdata/phoenix.md b/tests/functional/testdata/phoenix.md new file mode 100644 index 00000000..8ed90aea --- /dev/null +++ b/tests/functional/testdata/phoenix.md @@ -0,0 +1,284 @@ +# Phoenix (constellation) + +**Phoenix** is a minor [constellation](constellation "wikilink") in the +[southern sky](southern_sky "wikilink"). Named after the mythical +[phoenix](Phoenix_(mythology) "wikilink"), it was first depicted on a +celestial atlas by [Johann Bayer](Johann_Bayer "wikilink") in his 1603 +*[Uranometria](Uranometria "wikilink")*. The French explorer and +astronomer [Nicolas Louis de +Lacaille](Nicolas_Louis_de_Lacaille "wikilink") charted the brighter +stars and gave their [Bayer designations](Bayer_designation "wikilink") +in 1756. The constellation stretches from roughly −39 degrees to −57 degrees +[declination](declination "wikilink"), and from 23.5h to 2.5h of [right +ascension](right_ascension "wikilink"). The constellations Phoenix, +[Grus](Grus_(constellation) "wikilink"), +[Pavo](Pavo_(constellation) "wikilink") and [Tucana](Tucana "wikilink"), +are known as the Southern Birds. + +The brightest star, [Alpha Phoenicis](Alpha_Phoenicis "wikilink"), is +named Ankaa, an [Arabic](Arabic "wikilink") word meaning 'the Phoenix'. +It is an orange giant of apparent magnitude 2.4. Next is [Beta +Phoenicis](Beta_Phoenicis "wikilink"), actually a +[binary](Binary_star "wikilink") system composed of two yellow giants +with a combined apparent magnitude of 3.3. 
[Nu +Phoenicis](Nu_Phoenicis "wikilink") has a dust disk, while the +constellation has ten star systems with known planets and the recently +discovered [galaxy clusters](galaxy_cluster "wikilink") [El +Gordo](El_Gordo_(galaxy_cluster) "wikilink") and the [Phoenix +Cluster](Phoenix_Cluster "wikilink")—located 7.2 and 5.7 billion light +years away respectively, two of the largest objects in the [visible +universe](visible_universe "wikilink"). Phoenix is the +[radiant](radiant_(meteor_shower) "wikilink") of two annual [meteor +showers](meteor_shower "wikilink"): the +[Phoenicids](Phoenicids "wikilink") in December, and the July +Phoenicids. + +## History + +Phoenix was the largest of the 12 constellations established by [Petrus +Plancius](Petrus_Plancius "wikilink") from the observations of [Pieter +Dirkszoon Keyser](Pieter_Dirkszoon_Keyser "wikilink") and [Frederick de +Houtman](Frederick_de_Houtman "wikilink"). It first appeared on a 35cm +diameter celestial globe published in 1597 (or 1598) in Amsterdam by +Plancius with [Jodocus Hondius](Jodocus_Hondius "wikilink"). The first +depiction of this constellation in a celestial atlas was in [Johann +Bayer](Johann_Bayer "wikilink")'s +*[Uranometria](Uranometria "wikilink")* of 1603. De Houtman included +it in his southern star catalog the same year under the Dutch name *Den +voghel Fenicx*, "The Bird Phoenix", symbolising the +[phoenix](Phoenix_(mythology) "wikilink") of classical mythology. One +name of the brightest star [Alpha +Phoenicis](Alpha_Phoenicis "wikilink")—Ankaa—is derived from the Arabic: +العنقاء, romanized: al-‘anqā’, lit. 'the phoenix', and +was coined sometime after 1800 in relation to the constellation. + +Celestial historian Richard Allen noted that unlike the other +constellations introduced by Plancius and [La +Caille](La_Caille "wikilink"), Phoenix has actual precedent in ancient +astronomy, as the Arabs saw this formation as representing young +ostriches, *Al Ri'āl*, or as a griffin or eagle. 
In addition, the +same group of stars was sometimes imagined by the Arabs as a boat, *Al +Zaurak*, on the nearby river Eridanus. He observed, "the introduction +of a Phoenix into modern astronomy was, in a measure, by adoption rather +than by invention." + +The Chinese incorporated Phoenix's brightest star, Ankaa (Alpha +Phoenicis), and stars from the adjacent constellation +[Sculptor](Sculptor_(constellation) "wikilink") to depict *Bakui*, a net +for catching birds. Phoenix and the neighbouring constellation of +[Grus](Grus_(constellation) "wikilink") together were seen by [Julius +Schiller](Julius_Schiller "wikilink") as portraying +[Aaron](Aaron "wikilink") the High Priest. These two constellations, +along with nearby [Pavo](Pavo_(constellation) "wikilink") and +[Tucana](Tucana "wikilink"), are called the Southern Birds. + +## Characteristics + +Phoenix is a small constellation bordered by [Fornax](Fornax "wikilink") +and Sculptor to the north, Grus to the west, Tucana to the south, +touching on the corner of [Hydrus](Hydrus "wikilink") to the south, and +[Eridanus](Eridanus_(constellation) "wikilink") to the east and +southeast. The bright star [Achernar](Achernar "wikilink") is +nearby. The three-letter abbreviation for the constellation, as +adopted by the [International Astronomical +Union](International_Astronomical_Union "wikilink") in 1922, is +"Phe". The official constellation boundaries, as set by Belgian +astronomer [Eugène Delporte](Eugène_Joseph_Delporte "wikilink") in 1930, +are defined by a polygon of 10 segments. In the [equatorial coordinate +system](equatorial_coordinate_system "wikilink"), the [right +ascension](right_ascension "wikilink") coordinates of these borders lie +between 23h 26.5m and 02h 25.0m, +while the [declination](declination "wikilink") +coordinates are between −39.31° and −57.84°. 
This means it remains +below the horizon to anyone living north of the [40th +parallel](40th_parallel_north "wikilink") in the [Northern +Hemisphere](Northern_Hemisphere "wikilink"), and remains low in the sky +for anyone living north of the [equator](equator "wikilink"). It is most +visible from locations such as Australia and South Africa during late +[Southern Hemisphere](Southern_Hemisphere "wikilink") spring. Most +of the constellation lies within, and can be located by, forming a +triangle of the bright stars Achernar, [Fomalhaut](Fomalhaut "wikilink") +and [Beta Ceti](Beta_Ceti "wikilink")—Ankaa lies roughly in the centre +of this. + +## Features + +### Stars + +A curved line of stars comprising Alpha, +[Kappa](Kappa_Phoenicis "wikilink"), [Mu](Mu_Phoenicis "wikilink"), +[Beta](Beta_Phoenicis "wikilink"), [Nu](Nu_Phoenicis "wikilink") and +[Gamma Phoenicis](Gamma_Phoenicis "wikilink") was seen as a boat by the +ancient Arabs. French explorer and astronomer [Nicolas Louis de +Lacaille](Nicolas_Louis_de_Lacaille "wikilink") charted and designated +27 stars with the [Bayer designations](Bayer_designation "wikilink") +Alpha through to Omega in 1756. Of these, he labelled two stars close +together Lambda, and assigned Omicron, Psi and Omega to three stars, +which subsequent astronomers such as [Benjamin +Gould](Benjamin_Apthorp_Gould "wikilink") felt were too dim to warrant +their letters. A different star was subsequently labelled Psi Phoenicis, +while the other two designations fell out of use. + +Ankaa is the brightest star in the constellation. It is an orange giant +of [apparent visual magnitude](apparent_visual_magnitude "wikilink") +2.37 and [spectral type](Stellar_classification "wikilink") +K0.5IIIb, 77 light years distant from Earth and orbited by a +secondary object about which little is known. 
Lying close by Ankaa +is [Kappa Phoenicis](Kappa_Phoenicis "wikilink"), a [main +sequence](main_sequence "wikilink") star of spectral type A5IVn and +apparent magnitude 3.90. Located centrally in the asterism, +[Beta Phoenicis](Beta_Phoenicis "wikilink") is the second brightest star +in the constellation and another [binary star](binary_star "wikilink"). +Together the stars, both yellow giants of spectral type G8, shine with +an apparent magnitude of 3.31, though the components are of individual +apparent magnitudes of 4.0 and 4.1 and orbit each other every 168 +years. [Zeta Phoenicis](Zeta_Phoenicis "wikilink") or *Wurren* +is an [Algol](Algol_variable "wikilink")-type [eclipsing +binary](Binary_star#Eclipsing_binaries "wikilink"), with an [apparent +magnitude](apparent_magnitude "wikilink") fluctuating between 3.9 and +4.4 with a period of around 1.7 days (40 hours); its dimming results +from its two component blue-white B-type stars, which orbit and block +each other out as seen from Earth. The two stars are 0.05 AU from each other, +while a third star is around 600 AU away from the pair, and has an +orbital period exceeding 5000 years. The system is around 300 light +years distant. In 1976, researchers Clausen, Gyldenkerne, and +Grønbech calculated that a nearby 8th magnitude star is a fourth member +of the system. + +AI Phe is an eclipsing binary star identified in 1972. Its long mutual +eclipses and combination of spectroscopic and astrometric data allow +precise measurement of the masses and radii of the stars, which is +viewed as a potential cross-check on stellar properties and distances +independent of Cepheid variables and such techniques. The long eclipse +events require space-based observations to avoid Solar interference. +Gamma Phoenicis is a [red giant](red_giant "wikilink") of spectral type +M0IIIa and varies between magnitudes 3.39 and 3.49. It lies 235 +light years away. 
[Psi Phoenicis](Psi_Phoenicis "wikilink") is +another red giant, this time of spectral type M4III, and has an +apparent magnitude that ranges between 4.3 and 4.5 over a period of +around 30 days. Lying 340 light years away, it has around 85 +times the diameter, but only 85% of the mass, of the Sun. [W +Phoenicis](W_Phoenicis "wikilink") is a [Mira +variable](Mira_variable "wikilink"), ranging from magnitude 8.1 to 14.4 +over 333.95 days. A red giant, its spectrum ranges between M5e and +M6e. Located 6.5 degrees west of Ankaa is [SX +Phoenicis](SX_Phoenicis "wikilink"), a variable star which ranges from +magnitude 7.1 to 7.5 over a period of a mere 79 minutes. Its spectral +type varies between A2 and F4. It gives its name to a group of stars +known as [SX Phoenicis variables](SX_Phoenicis_variable "wikilink"). +[Rho](Rho_Phoenicis "wikilink") and [BD +Phoenicis](BD_Phoenicis "wikilink") are [Delta Scuti +variables](Delta_Scuti_variable "wikilink")—short period (six hours at +most) pulsating stars that have been used as [standard +candles](Cosmic_distance_ladder#Standard_candles "wikilink") and as +subjects to study [astroseismology](astroseismology "wikilink"). Rho +is spectral type F2III, and ranges between magnitudes 5.20 and 5.26 +over a period of 2.85 hours. BD is of spectral type A1V, and +ranges between magnitudes 5.90 and 5.94. + +[Nu Phoenicis](Nu_Phoenicis "wikilink") is a yellow-white main sequence +star of spectral type F9V and magnitude 4.96. Lying some 49 light +years distant, it is around 1.2 times as massive as the Sun, and +likely to be surrounded by a disk of dust. It is the closest star in +the constellation that is visible with the unaided eye. [Gliese +915](Gliese_915 "wikilink") is a [white dwarf](white_dwarf "wikilink") +only 26 light years away. It is of magnitude 13.05, too faint to be seen +with the naked eye. White dwarfs are extremely dense stars compacted +into a volume the size of the Earth. 
With around 85% of the mass of +the Sun, Gliese 915 has a [surface gravity](surface_gravity "wikilink") +of $10^{8.39 \pm 0.01}$ ($2.45 \cdot 10^{8}$) +[cm](centimetre "wikilink")·[s](second "wikilink")$^{-2}$, or +approximately 250,000 times [Earth's](Earth's_gravity "wikilink"). + +Ten stars have been found to have planets to date, and four planetary +systems have been discovered with the [SuperWASP](SuperWASP "wikilink") +project. [HD 142](HD_142 "wikilink") is a yellow giant that has an +apparent magnitude of 5.7, and has a planet ([HD 142 +b](HD_142_b "wikilink")) 1.36 times the mass of Jupiter which orbits +every 328 days. [HD 2039](HD_2039 "wikilink") is a yellow subgiant +with an apparent magnitude of 9.0 around 330 light years away which has +a planet ([HD 2039 b](HD_2039_b "wikilink")) six times the mass of +Jupiter. [WASP-18](WASP-18 "wikilink") is a star of magnitude 9.29 which +was discovered to have a hot Jupiter-like planet +([WASP-18b](WASP-18b "wikilink")) taking less than a day to orbit the +star. The planet is suspected to be causing WASP-18 to appear older +than it really is. [WASP-4](WASP-4 "wikilink") and +[WASP-5](WASP-5 "wikilink") are solar-type yellow stars around 1000 +light years distant and of 13th magnitude, each with a single planet +larger than Jupiter. [WASP-29](WASP-29 "wikilink") is an orange +dwarf of spectral type K4V and visual magnitude 11.3, which has a +planetary companion of similar size and mass to Saturn. The planet +completes an orbit every 3.9 days. + +[WISE J003231.09-494651.4](List_of_brown_dwarfs "wikilink") and [WISE +J001505.87-461517.6](List_of_brown_dwarfs "wikilink") are two [brown +dwarfs](brown_dwarf "wikilink") discovered by the [Wide-field Infrared +Survey Explorer](Wide-field_Infrared_Survey_Explorer "wikilink"), and +are 63 and 49 light years away respectively. 
Initially hypothesised +before they were belatedly discovered, brown dwarfs are objects more +massive than planets, but which are of insufficient mass for [hydrogen +fusion](Nuclear_fusion "wikilink") characteristic of stars to occur. +Many are being found by sky surveys. + +Phoenix contains [HE0107-5240](HE0107-5240 "wikilink"), possibly one of +the oldest stars yet discovered. It has around 1/200,000 the +[metallicity](metallicity "wikilink") that the Sun has and hence must +have formed very early in the history of the universe. With a visual +magnitude of 15.17, it is around 10,000 times dimmer than the +faintest stars visible to the naked eye and is 36,000 light years +distant. + +### Deep-sky objects + +The constellation does not lie on the [galactic +plane](galactic_plane "wikilink") of the Milky Way, and there are no +prominent star clusters. [NGC 625](NGC_625 "wikilink") is a dwarf +[irregular galaxy](irregular_galaxy "wikilink") of apparent magnitude +11.0 and lying some 12.7 million light years distant. Only 24000 light +years in diameter, it is an outlying member of the [Sculptor +Group](Sculptor_Group "wikilink"). NGC 625 is thought to have been +involved in a collision and is experiencing a burst of [active star +formation](Active_galactic_nucleus "wikilink"). [NGC +37](NGC_37 "wikilink") is a [lenticular +galaxy](lenticular_galaxy "wikilink") of apparent magnitude 14.66. It is +approximately 42 [kiloparsecs](kiloparsecs "wikilink") (137,000 +[light-years](light-years "wikilink")) in diameter and about 12.9 +billion years old. [Robert's Quartet](Robert's_Quartet "wikilink") +(composed of the irregular galaxy [NGC 87](NGC_87 "wikilink"), and three +spiral galaxies [NGC 88](NGC_88 "wikilink"), [NGC 89](NGC_89 "wikilink") +and [NGC 92](NGC_92 "wikilink")) is a group of four galaxies located +around 160 million light-years away which are in the process of +colliding and merging. 
They are within a circle of radius of 1.6 arcmin, +corresponding to about 75,000 light-years. Located in the galaxy ESO +243-49 is [HLX-1](HLX-1 "wikilink"), an [intermediate-mass black +hole](intermediate-mass_black_hole "wikilink")—the first one of its kind +identified. It is thought to be a remnant of a dwarf galaxy that was +absorbed in a [collision](Interacting_galaxy "wikilink") with ESO +243-49. Before its discovery, this class of black hole was only +hypothesized. + +Lying within the bounds of the constellation is the gigantic [Phoenix +cluster](Phoenix_cluster "wikilink"), which is around 7.3 million light +years wide and 5.7 billion light years away, making it one of the most +massive [galaxy clusters](galaxy_cluster "wikilink"). It was first +discovered in 2010, and the central galaxy is producing an estimated 740 +new stars a year. Larger still is [El +Gordo](El_Gordo_(galaxy_cluster) "wikilink"), or officially ACT-CL +J0102-4915, whose discovery was announced in 2012. Located around +7.2 billion light years away, it is composed of two subclusters in the +process of colliding, resulting in the spewing out of hot gas, seen in +X-rays and infrared images. + +### Meteor showers + +Phoenix is the [radiant](radiant_(meteor_shower) "wikilink") of two +annual [meteor showers](meteor_shower "wikilink"). The +[Phoenicids](Phoenicids "wikilink"), also known as the December +Phoenicids, were first observed on 3 December 1887. The shower was +particularly intense in December 1956, and is thought related to the +breakup of the [short-period comet](short-period_comet "wikilink") +[289P/Blanpain](289P/Blanpain "wikilink"). It peaks around 4–5 December, +though is not seen every year. A very minor meteor shower peaks +around July 14 with around one meteor an hour, though meteors can be +seen anytime from July 3 to 18; this shower is referred to as the July +Phoenicids.