kiln_ai.adapters.data_gen.data_gen_task

  1import json
  2
  3from pydantic import BaseModel
  4
  5from kiln_ai.adapters.prompt_builders import SimplePromptBuilder
  6from kiln_ai.datamodel import Project, Task
  7
  8from .data_gen_prompts import (
  9    SAMPLE_GENERATION_PROMPT,
 10    TREE_GENERATION_PROMPT,
 11)
 12
 13
 14class DataGenCategoriesTaskInput(BaseModel):
 15    """Input model for generating categories/subtopics.
 16
 17    Attributes:
 18        node_path: List of strings representing the hierarchical path to current node
 19        system_prompt: System prompt to guide the AI generation
 20        num_subtopics: Number of subtopics to generate
 21        human_guidance: Optional human guidance to influence generation
 22        existing_topics: Optional list of existing topics to avoid duplication
 23    """
 24
 25    node_path: list[str]
 26    system_prompt: str
 27    num_subtopics: int
 28    human_guidance: str | None = None
 29    existing_topics: list[str] | None = None
 30
 31    @classmethod
 32    def from_task(
 33        cls,
 34        task: Task,
 35        node_path: list[str] = [],
 36        num_subtopics: int = 6,
 37        human_guidance: str | None = None,
 38        existing_topics: list[str] | None = None,
 39    ) -> "DataGenCategoriesTaskInput":
 40        """Create a DataGenCategoriesTaskInput instance from a Task.
 41
 42        Args:
 43            task: The source Task object
 44            node_path: Path to current node in topic hierarchy
 45            num_subtopics: Number of subtopics to generate
 46            human_guidance: Optional guidance for generation
 47            existing_topics: Optional list of existing topics
 48
 49        Returns:
 50            A new DataGenCategoriesTaskInput instance
 51        """
 52        prompt_builder = SimplePromptBuilder(task=task)
 53        return cls(
 54            node_path=node_path,
 55            num_subtopics=num_subtopics,
 56            human_guidance=human_guidance,
 57            existing_topics=existing_topics,
 58            system_prompt=prompt_builder.build_prompt(),
 59        )
 60
 61
# Schema of the structured result expected from the category-generation task.
# NOTE(review): pydantic's model_json_schema() appears to emit the class docstring
# as the schema "description", and this model's schema is serialized into a task's
# output_json_schema elsewhere in this module — confirm before rewording the docstring.
class DataGenCategoriesTaskOutput(BaseModel):
    """Output model for generated categories/subtopics.

    Attributes:
        subtopics: List of generated subtopic strings
    """

    # One entry per generated subtopic.
    subtopics: list[str]
 70
 71
 72class DataGenCategoriesTask(Task, parent_of={}):
 73    """Task for generating hierarchical categories/subtopics.
 74
 75    Generates synthetic data categories which can be used to generate
 76    training data for model learning.
 77    """
 78
 79    def __init__(self):
 80        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
 81        tmp_project = Project(name="DataGen")
 82        super().__init__(
 83            name="DataGen",
 84            parent=tmp_project,
 85            description="A task which generates synthetic data categories, which in turn are used to generate training data for a model to learn from.",
 86            instruction=TREE_GENERATION_PROMPT,
 87            input_json_schema=json.dumps(
 88                DataGenCategoriesTaskInput.model_json_schema()
 89            ),
 90            output_json_schema=json.dumps(
 91                DataGenCategoriesTaskOutput.model_json_schema()
 92            ),
 93        )
 94
 95
 96class DataGenSampleTaskInput(BaseModel):
 97    """Input model for generating data samples for a kiln task.
 98
 99    Attributes:
100        topic: List of strings representing the topic path
101        system_prompt: System prompt to guide the AI generation
102        num_samples: Number of samples to generate
103        human_guidance: Optional human guidance to influence generation
104    """
105
106    topic: list[str]
107    system_prompt: str
108    num_samples: int
109    human_guidance: str | None = None
110
111    @classmethod
112    def from_task(
113        cls,
114        task: Task,
115        topic: list[str] = [],
116        num_samples: int = 8,
117        human_guidance: str | None = None,
118    ) -> "DataGenSampleTaskInput":
119        """Create a DataGenSampleTaskInput instance from a Task.
120
121        Args:
122            task: The source Task object
123            topic: Topic path for sample generation
124            num_samples: Number of samples to generate
125            human_guidance: Optional guidance for generation
126
127        Returns:
128            A new DataGenSampleTaskInput instance
129        """
130        prompt_builder = SimplePromptBuilder(task=task)
131        return cls(
132            topic=topic,
133            num_samples=num_samples,
134            human_guidance=human_guidance,
135            system_prompt=prompt_builder.build_prompt(),
136        )
137
138
def list_json_schema_for_task(task: Task) -> str:
    """Build the JSON schema for a batch of inputs to ``task``.

    The result describes an object with one required property,
    ``generated_samples``, holding an array. Each array item follows the
    task's own input schema, or is a plain string when the task has none.

    Args:
        task: Task object whose ``input_json_schema`` describes one item.

    Returns:
        The wrapping schema serialized as a JSON string.
    """
    raw_item_schema = task.input_json_schema
    # Fall back to a plain string item when no input schema is set.
    item_schema = json.loads(raw_item_schema) if raw_item_schema else {"type": "string"}

    return json.dumps(
        {
            "type": "object",
            "properties": {
                "generated_samples": {"type": "array", "items": item_schema},
            },
            "required": ["generated_samples"],
        }
    )
167
168
class DataGenSampleTask(Task, parent_of={}):
    """Task for generating data samples for a given topic.

    Generates synthetic data samples based on provided topics and subtopics.
    """

    def __init__(self, target_task: Task, num_samples: int = 8):
        """Configure the task with the sample-generation prompt and its I/O schemas.

        Args:
            target_task: Task whose input schema defines the shape of each
                generated sample (via ``list_json_schema_for_task``).
            num_samples: Accepted by the signature but not referenced in this
                constructor.
        """
        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
        placeholder_project = Project(name="DataGenSample")

        # Input follows DataGenSampleTaskInput; output wraps the target task's input schema.
        input_schema = json.dumps(DataGenSampleTaskInput.model_json_schema())
        output_schema = list_json_schema_for_task(target_task)

        super().__init__(
            name="DataGenSample",
            parent=placeholder_project,
            description="A task which generates synthetic data samples for a given topic (and optional subtopic).",
            instruction=SAMPLE_GENERATION_PROMPT,
            input_json_schema=input_schema,
            output_json_schema=output_schema,
        )
class DataGenCategoriesTaskInput(pydantic.main.BaseModel):
15class DataGenCategoriesTaskInput(BaseModel):
16    """Input model for generating categories/subtopics.
17
18    Attributes:
19        node_path: List of strings representing the hierarchical path to current node
20        system_prompt: System prompt to guide the AI generation
21        num_subtopics: Number of subtopics to generate
22        human_guidance: Optional human guidance to influence generation
23        existing_topics: Optional list of existing topics to avoid duplication
24    """
25
26    node_path: list[str]
27    system_prompt: str
28    num_subtopics: int
29    human_guidance: str | None = None
30    existing_topics: list[str] | None = None
31
32    @classmethod
33    def from_task(
34        cls,
35        task: Task,
36        node_path: list[str] = [],
37        num_subtopics: int = 6,
38        human_guidance: str | None = None,
39        existing_topics: list[str] | None = None,
40    ) -> "DataGenCategoriesTaskInput":
41        """Create a DataGenCategoriesTaskInput instance from a Task.
42
43        Args:
44            task: The source Task object
45            node_path: Path to current node in topic hierarchy
46            num_subtopics: Number of subtopics to generate
47            human_guidance: Optional guidance for generation
48            existing_topics: Optional list of existing topics
49
50        Returns:
51            A new DataGenCategoriesTaskInput instance
52        """
53        prompt_builder = SimplePromptBuilder(task=task)
54        return cls(
55            node_path=node_path,
56            num_subtopics=num_subtopics,
57            human_guidance=human_guidance,
58            existing_topics=existing_topics,
59            system_prompt=prompt_builder.build_prompt(),
60        )

Input model for generating categories/subtopics.

Attributes: node_path: List of strings representing the hierarchical path to current node system_prompt: System prompt to guide the AI generation num_subtopics: Number of subtopics to generate human_guidance: Optional human guidance to influence generation existing_topics: Optional list of existing topics to avoid duplication

node_path: list[str]
system_prompt: str
num_subtopics: int
human_guidance: str | None
existing_topics: list[str] | None
@classmethod
def from_task( cls, task: kiln_ai.datamodel.Task, node_path: list[str] = [], num_subtopics: int = 6, human_guidance: str | None = None, existing_topics: list[str] | None = None) -> DataGenCategoriesTaskInput:
32    @classmethod
33    def from_task(
34        cls,
35        task: Task,
36        node_path: list[str] = [],
37        num_subtopics: int = 6,
38        human_guidance: str | None = None,
39        existing_topics: list[str] | None = None,
40    ) -> "DataGenCategoriesTaskInput":
41        """Create a DataGenCategoriesTaskInput instance from a Task.
42
43        Args:
44            task: The source Task object
45            node_path: Path to current node in topic hierarchy
46            num_subtopics: Number of subtopics to generate
47            human_guidance: Optional guidance for generation
48            existing_topics: Optional list of existing topics
49
50        Returns:
51            A new DataGenCategoriesTaskInput instance
52        """
53        prompt_builder = SimplePromptBuilder(task=task)
54        return cls(
55            node_path=node_path,
56            num_subtopics=num_subtopics,
57            human_guidance=human_guidance,
58            existing_topics=existing_topics,
59            system_prompt=prompt_builder.build_prompt(),
60        )

Create a DataGenCategoriesTaskInput instance from a Task.

Args: task: The source Task object node_path: Path to current node in topic hierarchy num_subtopics: Number of subtopics to generate human_guidance: Optional guidance for generation existing_topics: Optional list of existing topics

Returns: A new DataGenCategoriesTaskInput instance

model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class DataGenCategoriesTaskOutput(pydantic.main.BaseModel):
63class DataGenCategoriesTaskOutput(BaseModel):
64    """Output model for generated categories/subtopics.
65
66    Attributes:
67        subtopics: List of generated subtopic strings
68    """
69
70    subtopics: list[str]

Output model for generated categories/subtopics.

Attributes: subtopics: List of generated subtopic strings

subtopics: list[str]
model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

class DataGenCategoriesTask(kiln_ai.datamodel.Task):
73class DataGenCategoriesTask(Task, parent_of={}):
74    """Task for generating hierarchical categories/subtopics.
75
76    Generates synthetic data categories which can be used to generate
77    training data for model learning.
78    """
79
80    def __init__(self):
81        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
82        tmp_project = Project(name="DataGen")
83        super().__init__(
84            name="DataGen",
85            parent=tmp_project,
86            description="A task which generates synthetic data categories, which in turn are used to generate training data for a model to learn from.",
87            instruction=TREE_GENERATION_PROMPT,
88            input_json_schema=json.dumps(
89                DataGenCategoriesTaskInput.model_json_schema()
90            ),
91            output_json_schema=json.dumps(
92                DataGenCategoriesTaskOutput.model_json_schema()
93            ),
94        )

Task for generating hierarchical categories/subtopics.

Generates synthetic data categories which can be used to generate training data for model learning.

DataGenCategoriesTask()
80    def __init__(self):
81        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
82        tmp_project = Project(name="DataGen")
83        super().__init__(
84            name="DataGen",
85            parent=tmp_project,
86            description="A task which generates synthetic data categories, which in turn are used to generate training data for a model to learn from.",
87            instruction=TREE_GENERATION_PROMPT,
88            input_json_schema=json.dumps(
89                DataGenCategoriesTaskInput.model_json_schema()
90            ),
91            output_json_schema=json.dumps(
92                DataGenCategoriesTaskOutput.model_json_schema()
93            ),
94        )

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.

class DataGenSampleTaskInput(pydantic.main.BaseModel):
 97class DataGenSampleTaskInput(BaseModel):
 98    """Input model for generating data samples for a kiln task.
 99
100    Attributes:
101        topic: List of strings representing the topic path
102        system_prompt: System prompt to guide the AI generation
103        num_samples: Number of samples to generate
104        human_guidance: Optional human guidance to influence generation
105    """
106
107    topic: list[str]
108    system_prompt: str
109    num_samples: int
110    human_guidance: str | None = None
111
112    @classmethod
113    def from_task(
114        cls,
115        task: Task,
116        topic: list[str] = [],
117        num_samples: int = 8,
118        human_guidance: str | None = None,
119    ) -> "DataGenSampleTaskInput":
120        """Create a DataGenSampleTaskInput instance from a Task.
121
122        Args:
123            task: The source Task object
124            topic: Topic path for sample generation
125            num_samples: Number of samples to generate
126            human_guidance: Optional guidance for generation
127
128        Returns:
129            A new DataGenSampleTaskInput instance
130        """
131        prompt_builder = SimplePromptBuilder(task=task)
132        return cls(
133            topic=topic,
134            num_samples=num_samples,
135            human_guidance=human_guidance,
136            system_prompt=prompt_builder.build_prompt(),
137        )

Input model for generating data samples for a kiln task.

Attributes: topic: List of strings representing the topic path system_prompt: System prompt to guide the AI generation num_samples: Number of samples to generate human_guidance: Optional human guidance to influence generation

topic: list[str]
system_prompt: str
num_samples: int
human_guidance: str | None
@classmethod
def from_task( cls, task: kiln_ai.datamodel.Task, topic: list[str] = [], num_samples: int = 8, human_guidance: str | None = None) -> DataGenSampleTaskInput:
112    @classmethod
113    def from_task(
114        cls,
115        task: Task,
116        topic: list[str] = [],
117        num_samples: int = 8,
118        human_guidance: str | None = None,
119    ) -> "DataGenSampleTaskInput":
120        """Create a DataGenSampleTaskInput instance from a Task.
121
122        Args:
123            task: The source Task object
124            topic: Topic path for sample generation
125            num_samples: Number of samples to generate
126            human_guidance: Optional guidance for generation
127
128        Returns:
129            A new DataGenSampleTaskInput instance
130        """
131        prompt_builder = SimplePromptBuilder(task=task)
132        return cls(
133            topic=topic,
134            num_samples=num_samples,
135            human_guidance=human_guidance,
136            system_prompt=prompt_builder.build_prompt(),
137        )

Create a DataGenSampleTaskInput instance from a Task.

Args: task: The source Task object topic: Topic path for sample generation num_samples: Number of samples to generate human_guidance: Optional guidance for generation

Returns: A new DataGenSampleTaskInput instance

model_config: ClassVar[pydantic.config.ConfigDict] = {}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def list_json_schema_for_task(task: kiln_ai.datamodel.Task) -> str:
140def list_json_schema_for_task(task: Task) -> str:
141    """Generate a JSON schema for a list of task inputs (json schema)
142
143    Args:
144        task: Task object whose input schema will be used
145
146    Returns:
147        JSON string representing the schema for a list of task inputs
148    """
149    if task.input_json_schema:
150        items_schema = json.loads(task.input_json_schema)
151    else:
152        items_schema = {"type": "string"}
153
154    list_schema = {
155        "type": "array",
156        "items": items_schema,
157    }
158
159    top_level_schema = {
160        "type": "object",
161        "properties": {
162            "generated_samples": list_schema,
163        },
164        "required": ["generated_samples"],
165    }
166
167    return json.dumps(top_level_schema)

Generate a JSON schema describing a list of task inputs.

Args: task: Task object whose input schema will be used

Returns: JSON string representing the schema for a list of task inputs

class DataGenSampleTask(kiln_ai.datamodel.Task):
170class DataGenSampleTask(Task, parent_of={}):
171    """Task for generating data samples for a given topic.
172
173    Generates synthetic data samples based on provided topics and subtopics.
174    """
175
176    def __init__(self, target_task: Task, num_samples: int = 8):
177        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
178        tmp_project = Project(name="DataGenSample")
179        super().__init__(
180            name="DataGenSample",
181            parent=tmp_project,
182            description="A task which generates synthetic data samples for a given topic (and optional subtopic).",
183            instruction=SAMPLE_GENERATION_PROMPT,
184            input_json_schema=json.dumps(DataGenSampleTaskInput.model_json_schema()),
185            output_json_schema=list_json_schema_for_task(target_task),
186        )

Task for generating data samples for a given topic.

Generates synthetic data samples based on provided topics and subtopics.

DataGenSampleTask(target_task: kiln_ai.datamodel.Task, num_samples: int = 8)
176    def __init__(self, target_task: Task, num_samples: int = 8):
177        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
178        tmp_project = Project(name="DataGenSample")
179        super().__init__(
180            name="DataGenSample",
181            parent=tmp_project,
182            description="A task which generates synthetic data samples for a given topic (and optional subtopic).",
183            instruction=SAMPLE_GENERATION_PROMPT,
184            input_json_schema=json.dumps(DataGenSampleTaskInput.model_json_schema()),
185            output_json_schema=list_json_schema_for_task(target_task),
186        )

Create a new model by parsing and validating input data from keyword arguments.

Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

def model_post_init(self: pydantic.main.BaseModel, context: Any, /) -> None:
122                    def wrapped_model_post_init(self: BaseModel, context: Any, /) -> None:
123                        """We need to both initialize private attributes and call the user-defined model_post_init
124                        method.
125                        """
126                        init_private_attributes(self, context)
127                        original_model_post_init(self, context)

We need to both initialize private attributes and call the user-defined model_post_init method.