async def main() -> None:
    """Load skills from ``full_skills_unique.json`` into the database.

    The IoC container is built from ``config.toml``; the key-skills gateway and
    the unit of work are resolved from a request scope. At most the first
    50000 skills are inserted, 100 per batch, with a commit after each batch.
    The container is closed whether or not loading succeeds.
    """
    settings = load_configuration(Path("config.toml"))
    container = make_ioc(settings)

    try:
        async with container() as request_scope:
            gateway = await request_scope.get(KeySkillsDataGateway)
            uow = await request_scope.get(DefaultUnitOfWork)

            raw_json = Path("full_skills_unique.json").read_text(encoding="utf-8")
            all_skills: list[str] = json.loads(raw_json)

            skills_to_load = all_skills[:50000]
            total = len(skills_to_load)

            print(f"Загружаю {total} скиллов в БД...")

            batch_size = 100
            for start in range(0, total, batch_size):
                stop = start + batch_size
                await gateway.add_skills(skills_to_load[start:stop])
                await uow.commit()
                # The last batch may be shorter, so cap the progress counter.
                print(f"Загружено {min(stop, total)} / {total}")

            print("Готово!")
    finally:
        await container.close()


if __name__ == "__main__":
    asyncio.run(main())
return [] # noqa + except (ValueError, SyntaxError): + return [] + + +def compose_embedding_text(position: str, description: str, key_skills: list[str]) -> str: + skills_text = ", ".join(key_skills) if key_skills else "" + parts = [position, description, skills_text] + return " ".join(filter(None, parts)) + + +async def main() -> None: + backend_config_path = Path("config.toml") + backend_configuration = load_backend_configuration(backend_config_path) + backend_container = make_backend_ioc(backend_configuration) + + ml_config_path = Path("infrastructure/configs/ml/config.toml") + ml_configuration = load_ml_configuration(ml_config_path) + ml_container = make_ml_ioc(ml_configuration) + + csv_path = Path("filtered_vacancies.csv") + max_records = 51 + + try: + async with backend_container() as backend_request_container, ml_container() as ml_request_container: + unit_of_work = await backend_request_container.get(DefaultUnitOfWork) + embedder = await ml_request_container.get(Embedder) + + print(f"Загружаю первые {max_records} вакансий из {csv_path}...") + + with csv_path.open("r", encoding="utf-8") as f: + reader = csv.DictReader(f) + batch_size = 50 + batch = [] + + for idx, row in enumerate(reader): + if idx >= max_records: + break + + try: + vacancy_id_str = row.get("vacancy_id", "").strip() + if not vacancy_id_str: + continue + + position = row.get("vacancy_nm", "").strip() + if not position: + continue + + experience_str = row.get("experience", "").strip() + try: + experience_type = ExperienceType(experience_str) + except ValueError: + continue + + salary_from_str = row.get("salary_from", "").strip() + salary_to_str = row.get("salary_to", "").strip() + try: + salary_from = Decimal(salary_from_str) if salary_from_str else Decimal(0) + salary_to = Decimal(salary_to_str) if salary_to_str else Decimal(0) + except (ValueError, TypeError): + continue + + description = row.get("vacancy_description", "").strip() + key_skills = parse_skills(row.get("key_skills", "[]")) + + 
vacancy = Vacancy.factory( + position=position, + from_salary=salary_from, + to_salary=salary_to, + experience_type=experience_type, + description=description, + key_skills=key_skills, + ) + + embedding_text = compose_embedding_text(position, description, key_skills) + embedding_vector = await embedder.encode(embedding_text) + + embedding = VacancyEmbedding.factory( + vacancy_id=vacancy.id, + vector=embedding_vector, + ) + + await unit_of_work.add(vacancy, embedding) + batch.append((vacancy.id, position)) + + if len(batch) >= batch_size: + await unit_of_work.commit() + print(f"Загружено {len(batch)} вакансий (всего: {idx + 1})") + batch = [] + + except Exception as e: + print(f"Ошибка при обработке строки {idx + 1}: {e}") + continue + + if batch: + await unit_of_work.commit() + print(f"Загружено {len(batch)} вакансий (всего: {idx + 1})") + + print("Готово!") + finally: + await backend_container.close() + await ml_container.close() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/src/template_project/adapters/data_gateways/tables.py b/src/template_project/adapters/data_gateways/tables.py index c6d4cdd..58e70e7 100644 --- a/src/template_project/adapters/data_gateways/tables.py +++ b/src/template_project/adapters/data_gateways/tables.py @@ -34,6 +34,7 @@ from template_project.application.resume.entity import ( ) from template_project.application.user.entity import User from template_project.application.user.profile.entity import Profile +from template_project.application.vacancy.entity import Vacancy, VacancyEmbedding meta_data: Final = MetaData() mapper_registry: Final = registry() @@ -205,6 +206,30 @@ resume_project_table: Final = Table( Column("description", String, nullable=False), ) +vacancy_table: Final = Table( + "vacancy", + meta_data, + Column("id", UUID, primary_key=True), + Column("deleted_at", DateTime(timezone=True)), + Column("created_at", DateTime(timezone=True), nullable=False), + Column("position", String, nullable=False), + 
# Job vacancies. ``deleted_at`` is the only nullable column; ``key_skills``
# falls back to an empty JSONB array on the database side when no value is
# supplied.
vacancy_table: Final = Table(
    "vacancy",
    meta_data,
    Column("id", UUID, primary_key=True),
    Column("deleted_at", DateTime(timezone=True)),
    Column("created_at", DateTime(timezone=True), nullable=False),
    Column("position", String, nullable=False),
    Column("from_salary", Numeric, nullable=False),
    Column("to_salary", Numeric, nullable=False),
    Column("experience_type", String, nullable=False),
    Column("description", String, nullable=False),
    Column("key_skills", StringArrayType(), nullable=False, server_default=text("'[]'::jsonb")),
)

# Embedding vectors for vacancies. Each row references a vacancy and is removed
# with it (``ON DELETE CASCADE``).
# NOTE(review): ``Vector`` is declared without a dimension and ``vacancy_id``
# has no index or uniqueness constraint; confirm both are intended.
vacancy_embedding_table: Final = Table(
    "vacancy_embedding",
    meta_data,
    Column("id", UUID, primary_key=True),
    Column("deleted_at", DateTime(timezone=True)),
    Column("created_at", DateTime(timezone=True), nullable=False),
    Column("vacancy_id", UUID, ForeignKey("vacancy.id", ondelete="CASCADE"), nullable=False),
    Column("vector", Vector, nullable=False),
)

# Map the entities imperatively onto the tables above; ``key_skills`` is bound
# explicitly to its column.
mapper_registry.map_imperatively(
    Vacancy,
    vacancy_table,
    properties={
        "key_skills": vacancy_table.c.key_skills,
    },
)
mapper_registry.map_imperatively(VacancyEmbedding, vacancy_embedding_table)
VacancyId = NewType("VacancyId", UUID)
VacancyEmbeddingId = NewType("VacancyEmbeddingId", UUID)


@to_entity
class Vacancy(Entity[VacancyId]):
    """A job vacancy: position, salary range, experience level, text and skills."""

    position: str
    from_salary: Decimal
    to_salary: Decimal
    experience_type: ExperienceType
    description: str
    key_skills: list[str]

    @classmethod
    def factory(
        cls,
        position: str,
        from_salary: Decimal,
        to_salary: Decimal,
        experience_type: ExperienceType,
        description: str,
        key_skills: list[str],
    ) -> Self:
        """Create a vacancy with a fresh UUIDv7 id, stamped with the current UTC time."""
        new_id = VacancyId(uuid7())
        now = datetime.now(tz=UTC)
        return cls(
            id=new_id,
            created_at=now,
            position=position,
            from_salary=from_salary,
            to_salary=to_salary,
            experience_type=experience_type,
            description=description,
            key_skills=key_skills,
        )


@to_entity
class VacancyEmbedding(Entity[VacancyEmbeddingId]):
    """The embedding vector computed for a single vacancy."""

    vacancy_id: VacancyId
    vector: list[float]

    @classmethod
    def factory(
        cls,
        vacancy_id: VacancyId,
        vector: list[float],
    ) -> Self:
        """Create an embedding with a fresh UUIDv7 id, stamped with the current UTC time."""
        new_id = VacancyEmbeddingId(uuid7())
        now = datetime.now(tz=UTC)
        return cls(
            id=new_id,
            created_at=now,
            vacancy_id=vacancy_id,
            vector=vector,
        )
# This revision follows 892aba57b356 and adds the vacancy tables.
revision: str = '9a32674539dd'
down_revision: Union[str, Sequence[str], None] = '892aba57b356'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Create the ``vacancy`` and ``vacancy_embedding`` tables."""
    # ### commands auto generated by Alembic - please adjust! ###
    # ``vacancy`` is created first because ``vacancy_embedding`` has a foreign
    # key to it.
    # NOTE(review): the ``key_skills`` type is taken from application code, so
    # this migration depends on ``StringArrayType`` keeping this constructor.
    op.create_table('vacancy',
    sa.Column('id', sa.UUID(), nullable=False),
    sa.Column('deleted_at', sa.DateTime(timezone=True), nullable=True),
    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
    sa.Column('position', sa.String(), nullable=False),
    sa.Column('from_salary', sa.Numeric(), nullable=False),
    sa.Column('to_salary', sa.Numeric(), nullable=False),
    sa.Column('experience_type', sa.String(), nullable=False),
    sa.Column('description', sa.String(), nullable=False),
    sa.Column('key_skills', template_project.adapters.data_gateways.tables.StringArrayType(astext_type=Text()), server_default=sa.text("'[]'::jsonb"), nullable=False),
    sa.PrimaryKeyConstraint('id')
    )
    # NOTE(review): ``VECTOR()`` is created without a dimension; confirm this
    # is intended.
    op.create_table('vacancy_embedding',
    sa.Column('id', sa.UUID(), nullable=False),
    sa.Column('deleted_at', sa.DateTime(timezone=True), nullable=True),
    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
    sa.Column('vacancy_id', sa.UUID(), nullable=False),
    sa.Column('vector', pgvector.sqlalchemy.vector.VECTOR(), nullable=False),
    sa.ForeignKeyConstraint(['vacancy_id'], ['vacancy.id'], ondelete='CASCADE'),
    sa.PrimaryKeyConstraint('id')
    )
    # ### end Alembic commands ###


def downgrade() -> None:
    """Drop the ``vacancy_embedding`` and ``vacancy`` tables."""
    # ### commands auto generated by Alembic - please adjust! ###
    # The referencing table is dropped before the table it points to.
    op.drop_table('vacancy_embedding')
    op.drop_table('vacancy')
    # ### end Alembic commands ###