Skip to content

Commit

Permalink
v1 Parsing for paragraphs & tx fieldsets (#132)
Browse files Browse the repository at this point in the history
* implement api v1 for paragraphs and text inputs

* api endpoint

* checkboxes and radio groups

* Remove unused import

* Use domain function generatePatternId for random IDs.

---------

Co-authored-by: Daniel Naab <[email protected]>
  • Loading branch information
jimmoffet and danielnaab authored May 21, 2024
1 parent c273122 commit 09a42bb
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 97 deletions.
5 changes: 5 additions & 0 deletions packages/forms/src/components.ts
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,11 @@ export const createPromptForPattern: CreatePrompt<Pattern> = (
options
) => {
const patternConfig = getPatternConfig(config, pattern.type);
if (!patternConfig) {
throw new Error(
`Pattern config not found for pattern type ${pattern.type} with id ${pattern.id} and config ${JSON.stringify(config, null, 2)}`
);
}
return patternConfig.createPrompt(config, session, pattern, options);
};

Expand Down
269 changes: 174 additions & 95 deletions packages/forms/src/documents/pdf/parsing-api.ts
Original file line number Diff line number Diff line change
@@ -1,89 +1,122 @@
import * as z from 'zod';

import { type PatternId, type PatternMap } from '../..';
import { generatePatternId, type PatternId, type PatternMap } from '../..';

import { type FieldsetPattern } from '../../patterns/fieldset';
import { type InputPattern } from '../../patterns/input';
import { type ParagraphPattern } from '../../patterns/paragraph';
import { type SequencePattern } from '../../patterns/sequence';
import { type CheckboxPattern } from '../../patterns/checkbox';
import { type RadioGroupPattern } from '../../patterns/radio-group';

import { stringToBase64, uint8ArrayToBase64 } from '../util';
import { uint8ArrayToBase64 } from '../util';
import { type DocumentFieldMap } from '../types';

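/** API v1 response format
*
* The response is a JSON object with a "form_summary" object and an
* "elements" array; each entry in "elements" is one of the component
* shapes below other than form_summary.
*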
* // formSummary json
* {
* "component_type": "form_summary",
* "title": "", // The title of the form.
* "description": "" // A brief description of the form.
* }
*
* // TxInput json
* {
* "component_type": "text_input",
* "id": "", // A unique identifier for the text input.
* "label": "", // The label text for the text input.
* "default_value": "", // The default value of the text input.
* "required": true // Whether the text input is required.
* }
*
* // checkbox json
* {
* "component_type": "checkbox",
* "id": "", // A unique identifier for the checkbox.
* "label": "", // The label text for the checkbox.
* "default_checked": false // Whether the checkbox is checked by default.
* }
*
* // radioGroup json
* {
* "component_type": "radio_group",
* "id": "", // A unique identifier for the radio group.
* "legend": "", // The legend for the radio group.
* "options": [
* {
* "id": "", // A unique identifier for each option.
* "label": "", // The label text for the option.
* "name": "", // The name shared by all options in the group.
* "default_checked": false // Whether the option is checked by default.
* }
* ]
* }
*
* // paragraph json
* {
* "component_type": "paragraph",
* "text": "" // The text content of the paragraph.
* }
*
* // fieldset json
* {
* "component_type": "fieldset",
* "legend": "", // The legend for the field set.
* "fields": [] // An array of elements, can include text inputs and checkboxes.
* }
*/

/**
 * Zod schema for the `form_summary` object of a parser response: a fixed
 * `component_type` tag of 'form_summary' plus string `title` and
 * `description` fields.
 */
const FormSummary = z.object({
component_type: z.literal('form_summary'),
title: z.string(),
description: z.string(),
});

const TxInput = z.object({
input_type: z.literal('Tx'),
input_params: z.object({
text: z.string(),
text_style: z.string(),
output_id: z.string(),
placeholder: z.string(),
instructions: z.string(),
required: z.boolean(),
options: z.array(z.string()),
}),
component_type: z.literal('text_input'),
id: z.string(),
label: z.string(),
default_value: z.string(),
required: z.boolean(),
});

const BtnInput = z.object({
input_type: z.literal('Btn'),
input_params: z.object({
text: z.string(),
text_style: z.string(),
output_id: z.string(),
placeholder: z.string(),
instructions: z.string(),
required: z.boolean(),
options: z.array(z.string()),
}),
const Checkbox = z.object({
component_type: z.literal('checkbox'),
id: z.string(),
label: z.string(),
default_checked: z.boolean(),
});

const ExtractedInput = z.discriminatedUnion('input_type', [TxInput, BtnInput]);
type ExtractedInput = z.infer<typeof ExtractedInput>;
/**
 * Zod schema for a single option inside a `radio_group` component.
 * All four fields are required: string `id`, `label` and `name`, and a
 * boolean `default_checked`.
 */
const RadioGroupOption = z.object({
id: z.string(),
label: z.string(),
name: z.string(),
default_checked: z.boolean(),
});

const ExtractedElement = z.object({
const RadioGroup = z.object({
id: z.string(),
group_id: z.number(),
element_type: z.string(),
element_params: z.object({
text: z.string(),
text_style: z.string(),
options: z.string().array().nullable(),
}),
inputs: ExtractedInput.array(),
parent: z.string().nullable(),
component_type: z.literal('radio_group'),
legend: z.string(),
options: RadioGroupOption.array(),
});
type ExtractedElement = z.infer<typeof ExtractedElement>;

const RawTxField = z.object({
type: z.literal('/Tx'),
var_name: z.string(),
field_dict: z.object({
font_info: z.string(),
field_type: z.string(),
coordinates: z.number().array().optional(),
field_label: z.string(),
field_instructions: z.string(),
}),

/**
 * Zod schema for a `paragraph` component: a fixed `component_type` tag of
 * 'paragraph' and the paragraph's `text` as a string.
 */
const Paragraph = z.object({
component_type: z.literal('paragraph'),
text: z.string(),
});

const RawBtnField = z.object({
type: z.literal('/Btn'),
var_name: z.string(),
field_dict: z.object({
font_info: z.string(),
flags: z.unknown().optional(),
field_type: z.string(),
field_label: z.string(),
child_fields: z.array(z.object({ coordinates: z.number().array() })),
num_children: z.number(),
}),
const Fieldset = z.object({
component_type: z.literal('fieldset'),
legend: z.string(),
fields: z.union([TxInput, Checkbox]).array(),
});

const ExtractedObject = z.object({
raw_text: z.string(),
title: z.string(),
description: z.string(),
elements: ExtractedElement.array(),
raw_fields: z.discriminatedUnion('type', [RawTxField, RawBtnField]).array(),
form_summary: FormSummary,
elements: z
.union([TxInput, Checkbox, RadioGroup, Paragraph, Fieldset])
.array(),
});

type ExtractedObject = z.infer<typeof ExtractedObject>;
Expand All @@ -97,7 +130,7 @@ export type ParsedPdf = {

export const callExternalParser = async (
rawData: Uint8Array,
endpointUrl: string = 'https://10x-atj-doc-automation-staging.app.cloud.gov/api/parse'
endpointUrl: string = 'https://10x-atj-doc-automation-staging.app.cloud.gov/api/v1/parse'
): Promise<ParsedPdf> => {
const base64 = await uint8ArrayToBase64(rawData);

Expand All @@ -122,65 +155,111 @@ export const callExternalParser = async (
patterns: {},
outputs: {},
root: 'root',
title: extracted.title,
title: extracted.form_summary.title,
};

const rootSequence: PatternId[] = [];

for (const element of extracted.elements) {
const randomId = generatePatternId();
const fieldsetPatterns: PatternId[] = [];
if (element.inputs.length === 0) {
parsedPdf.patterns[element.id] = {

// Add paragraph elements
if (element.component_type === 'paragraph') {
parsedPdf.patterns[randomId] = {
type: 'paragraph',
id: element.id,
id: randomId,
data: {
text: element.element_params.text,
text: element.text,
},
} satisfies ParagraphPattern;
rootSequence.push(element.id);
rootSequence.push(randomId);
continue;
}
for (const input of element.inputs) {
if (input.input_type === 'Tx') {
const id = stringToBase64(input.input_params.output_id);
parsedPdf.patterns[id] = {
type: 'input',
id,
data: {
label: input.input_params.instructions,
required: false,
initial: '',
maxLength: 128,
},
} satisfies InputPattern;
fieldsetPatterns.push(id);
parsedPdf.outputs[id] = {
type: 'TextField',
name: input.input_params.output_id,
label: input.input_params.instructions,
value: '',
maxLength: 1024,
required: input.input_params.required,
};
}

if (element.component_type === 'checkbox') {
parsedPdf.patterns[element.id] = {
type: 'checkbox',
id: element.id,
data: {
label: element.label,
defaultChecked: element.default_checked,
},
} satisfies CheckboxPattern;
rootSequence.push(element.id);
continue;
}
if (fieldsetPatterns.length > 0) {

if (element.component_type === 'radio_group') {
parsedPdf.patterns[element.id] = {
type: 'radio-group',
id: element.id,
data: {
label: element.legend,
options: element.options.map(option => ({
id: option.id,
label: option.label,
name: option.name,
defaultChecked: option.default_checked,
})),
},
} satisfies RadioGroupPattern;
rootSequence.push(element.id);
continue;
}

if (element.component_type === 'fieldset') {
for (const input of element.fields) {
if (input.component_type === 'text_input') {
// const id = stringToBase64(input.id);

parsedPdf.patterns[input.id] = {
type: 'input',
id: input.id,
data: {
label: input.label,
required: false,
initial: '',
maxLength: 128,
},
} satisfies InputPattern;

fieldsetPatterns.push(input.id);

parsedPdf.outputs[input.id] = {
type: 'TextField',
name: input.id,
label: input.label,
value: '',
maxLength: 1024,
required: input.required,
};
}
// TODO: Look for checkbox or other element types
}
}

// Add fieldset to parsedPdf.patterns and rootSequence
if (element.component_type === 'fieldset' && fieldsetPatterns.length > 0) {
parsedPdf.patterns[randomId] = {
id: randomId,
type: 'fieldset',
data: {
legend: element.element_params.text,
legend: element.legend,
patterns: fieldsetPatterns,
},
} satisfies FieldsetPattern;
rootSequence.push(element.id);
rootSequence.push(randomId);
}
}

parsedPdf.patterns['root'] = {
id: 'root',
type: 'sequence',
data: {
patterns: rootSequence,
},
} satisfies SequencePattern;

return parsedPdf;
};
2 changes: 1 addition & 1 deletion packages/forms/src/pattern.ts
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ export const updatePatternFromFormData = (
};
};

const generatePatternId = () => crypto.randomUUID();
export const generatePatternId = () => crypto.randomUUID();

export const createPattern = (
config: FormConfig,
Expand Down
2 changes: 1 addition & 1 deletion packages/forms/src/patterns/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ export const defaultFormConfig: FormConfig = {
fieldset: fieldsetConfig,
input: inputConfig,
paragraph: paragraphConfig,
radioGroup: radioGroupConfig,
'radio-group': radioGroupConfig,
sequence: sequenceConfig,
},
};

0 comments on commit 09a42bb

Please sign in to comment.