You've already forked RekomenciBackend
feat(): load key skills and vacancies scripts
This commit is contained in:
@@ -16,7 +16,6 @@ WORKDIR /app
|
|||||||
|
|
||||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||||
PYTHONUNBUFFERED=1 \
|
PYTHONUNBUFFERED=1 \
|
||||||
PYTHONOPTIMIZE=2 \
|
|
||||||
UV_COMPILE_BYTECODE=1 \
|
UV_COMPILE_BYTECODE=1 \
|
||||||
UV_LINK_MODE=copy \
|
UV_LINK_MODE=copy \
|
||||||
UV_PROJECT_ENVIRONMENT=/opt/venv
|
UV_PROJECT_ENVIRONMENT=/opt/venv
|
||||||
@@ -48,7 +47,6 @@ RUN apt-get update && \
|
|||||||
|
|
||||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||||
PYTHONUNBUFFERED=1 \
|
PYTHONUNBUFFERED=1 \
|
||||||
PYTHONOPTIMIZE=2 \
|
|
||||||
PATH="/opt/venv/bin:$PATH" \
|
PATH="/opt/venv/bin:$PATH" \
|
||||||
PYTHONPATH="/app:$PYTHONPATH"
|
PYTHONPATH="/app:$PYTHONPATH"
|
||||||
|
|
||||||
@@ -73,7 +71,6 @@ RUN apt-get update && \
|
|||||||
|
|
||||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||||
PYTHONUNBUFFERED=1 \
|
PYTHONUNBUFFERED=1 \
|
||||||
PYTHONOPTIMIZE=2 \
|
|
||||||
PATH="/opt/venv/bin:$PATH" \
|
PATH="/opt/venv/bin:$PATH" \
|
||||||
PYTHONPATH="/app:$PYTHONPATH"
|
PYTHONPATH="/app:$PYTHONPATH"
|
||||||
|
|
||||||
@@ -93,7 +90,6 @@ FROM base-builder AS tests
|
|||||||
|
|
||||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||||
PYTHONUNBUFFERED=1 \
|
PYTHONUNBUFFERED=1 \
|
||||||
PYTHONOPTIMIZE=2 \
|
|
||||||
PATH="/opt/venv/bin:$PATH" \
|
PATH="/opt/venv/bin:$PATH" \
|
||||||
PYTHONPATH="/app:$PYTHONPATH"
|
PYTHONPATH="/app:$PYTHONPATH"
|
||||||
|
|
||||||
@@ -115,7 +111,6 @@ FROM base-builder AS migrations
|
|||||||
|
|
||||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||||
PYTHONUNBUFFERED=1 \
|
PYTHONUNBUFFERED=1 \
|
||||||
PYTHONOPTIMIZE=2 \
|
|
||||||
PATH="/opt/venv/bin:$PATH" \
|
PATH="/opt/venv/bin:$PATH" \
|
||||||
PYTHONPATH="/app:$PYTHONPATH"
|
PYTHONPATH="/app:$PYTHONPATH"
|
||||||
|
|
||||||
|
|||||||
@@ -164,6 +164,7 @@ select = [
|
|||||||
"YTT", # flake8-2020
|
"YTT", # flake8-2020
|
||||||
]
|
]
|
||||||
ignore = [
|
ignore = [
|
||||||
|
"PLR1702",
|
||||||
"A005", # allow to shadow stdlib and builtin module names
|
"A005", # allow to shadow stdlib and builtin module names
|
||||||
"COM812", # trailing comma, conflicts with `ruff format`
|
"COM812", # trailing comma, conflicts with `ruff format`
|
||||||
# Different doc rules that we don't really care about:
|
# Different doc rules that we don't really care about:
|
||||||
|
|||||||
Executable
+44
@@ -0,0 +1,44 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from template_project.adapters.data_gateways.key_skills import KeySkillsDataGateway
|
||||||
|
from template_project.adapters.unit_of_work import DefaultUnitOfWork
|
||||||
|
from template_project.web_api.configuration import load_configuration
|
||||||
|
from template_project.web_api.ioc.make import make_ioc
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
    """Load key skills from ``full_skills_unique.json`` into the database.

    Builds the IoC container from ``config.toml``, resolves the key-skills
    gateway and the unit of work from a request scope, and stores at most the
    first 50000 entries of the JSON file in chunks of 100, committing after
    every chunk. The container is closed even when loading fails.
    """
    container = make_ioc(load_configuration(Path("config.toml")))

    try:
        async with container() as request_container:
            skills_gateway = await request_container.get(KeySkillsDataGateway)
            uow = await request_container.get(DefaultUnitOfWork)

            with Path("full_skills_unique.json").open("r", encoding="utf-8") as source:
                loaded: list[str] = json.load(source)

            skills_to_load = loaded[:50000]
            total = len(skills_to_load)

            print(f"Загружаю {total} скиллов в БД...")

            chunk_size = 100
            start = 0
            while start < total:
                stop = start + chunk_size
                await skills_gateway.add_skills(skills_to_load[start:stop])
                # One commit per chunk, so earlier chunks are already committed
                # when a later one fails.
                await uow.commit()
                print(f"Загружено {min(stop, total)} / {total}")
                start = stop

            print("Готово!")
    finally:
        await container.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Run the loader only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
Executable
+128
@@ -0,0 +1,128 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import ast
|
||||||
|
import asyncio
|
||||||
|
import csv
|
||||||
|
from decimal import Decimal
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from template_project.adapters.unit_of_work import DefaultUnitOfWork
|
||||||
|
from template_project.application.common.embedding import Embedder
|
||||||
|
from template_project.application.common.enums import ExperienceType
|
||||||
|
from template_project.application.vacancy.entity import Vacancy, VacancyEmbedding
|
||||||
|
from template_project.ml.configuration import load_configuration as load_ml_configuration
|
||||||
|
from template_project.ml.ioc.make import make_ioc as make_ml_ioc
|
||||||
|
from template_project.web_api.configuration import load_configuration as load_backend_configuration
|
||||||
|
from template_project.web_api.ioc.make import make_ioc as make_backend_ioc
|
||||||
|
|
||||||
|
|
||||||
|
def parse_skills(skills_str: str) -> list[str]:
    """Parse a Python-literal list of skills into a list of strings.

    Args:
        skills_str: Text holding a Python list literal,
            e.g. ``"['Python', 'SQL']"``.

    Returns:
        The list items converted with ``str``. An empty list is returned when
        the text is not a valid literal or the literal is not a list.
    """
    try:
        skills = ast.literal_eval(skills_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval raises any of these on malformed input; for example
        # "{[1]: 2}" fails with TypeError (unhashable dict key).
        return []

    if not isinstance(skills, list):
        return []
    return [str(skill) for skill in skills]
|
||||||
|
|
||||||
|
|
||||||
|
def compose_embedding_text(position: str, description: str, key_skills: list[str]) -> str:
    """Build the text that is passed to the embedder for one vacancy.

    Args:
        position: Vacancy title.
        description: Vacancy description.
        key_skills: Skill names; blank entries are ignored.

    Returns:
        Position, description and the comma-separated skills joined by single
        spaces. Parts that are empty or whitespace-only are left out.
    """
    # Blank skills are dropped so the result never contains ", , " runs.
    skills_text = ", ".join(skill for skill in key_skills or () if skill.strip())
    parts = (position, description, skills_text)
    return " ".join(part for part in parts if part.strip())
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
    """Load vacancies from ``filtered_vacancies.csv`` together with embeddings.

    Reads at most ``max_records`` CSV rows. A row is skipped when it has no
    vacancy id or position, an unknown experience value, or a malformed
    salary. For every remaining row a ``Vacancy`` and a ``VacancyEmbedding``
    (encoded from position, description and key skills) are added to the unit
    of work, which is committed every ``batch_size`` vacancies and once more
    for the remainder. Any other per-row failure is printed and the loop goes
    on. Both IoC containers are closed on exit.
    """
    # Function-scope import: Decimal reports malformed text with
    # InvalidOperation, which is not a ValueError subclass.
    from decimal import InvalidOperation

    backend_config_path = Path("config.toml")
    backend_configuration = load_backend_configuration(backend_config_path)
    backend_container = make_backend_ioc(backend_configuration)

    ml_config_path = Path("infrastructure/configs/ml/config.toml")
    ml_configuration = load_ml_configuration(ml_config_path)
    ml_container = make_ml_ioc(ml_configuration)

    csv_path = Path("filtered_vacancies.csv")
    max_records = 51
    batch_size = 50

    def read_field(row: dict, name: str) -> str:
        """Return the stripped value of *name*, or "" when it is missing."""
        # DictReader fills fields absent from a short row with None.
        return (row.get(name) or "").strip()

    try:
        async with backend_container() as backend_request_container, ml_container() as ml_request_container:
            unit_of_work = await backend_request_container.get(DefaultUnitOfWork)
            embedder = await ml_request_container.get(Embedder)

            print(f"Загружаю первые {max_records} вакансий из {csv_path}...")

            # newline="" lets the csv module handle line breaks inside quoted
            # fields itself, as the csv documentation requires.
            with csv_path.open("r", encoding="utf-8", newline="") as f:
                reader = csv.DictReader(f)
                pending = 0  # vacancies added since the last commit
                loaded_total = 0  # vacancies committed so far

                for idx, row in enumerate(reader):
                    if idx >= max_records:
                        break

                    try:
                        if not read_field(row, "vacancy_id"):
                            continue

                        position = read_field(row, "vacancy_nm")
                        if not position:
                            continue

                        try:
                            experience_type = ExperienceType(read_field(row, "experience"))
                        except ValueError:
                            continue

                        salary_from_str = read_field(row, "salary_from")
                        salary_to_str = read_field(row, "salary_to")
                        try:
                            salary_from = Decimal(salary_from_str) if salary_from_str else Decimal(0)
                            salary_to = Decimal(salary_to_str) if salary_to_str else Decimal(0)
                        except (InvalidOperation, ValueError, TypeError):
                            continue

                        description = read_field(row, "vacancy_description")
                        key_skills = parse_skills(row.get("key_skills") or "[]")

                        vacancy = Vacancy.factory(
                            position=position,
                            from_salary=salary_from,
                            to_salary=salary_to,
                            experience_type=experience_type,
                            description=description,
                            key_skills=key_skills,
                        )

                        embedding_text = compose_embedding_text(position, description, key_skills)
                        embedding_vector = await embedder.encode(embedding_text)

                        embedding = VacancyEmbedding.factory(
                            vacancy_id=vacancy.id,
                            vector=embedding_vector,
                        )

                        await unit_of_work.add(vacancy, embedding)
                        pending += 1

                        if pending >= batch_size:
                            await unit_of_work.commit()
                            loaded_total += pending
                            print(f"Загружено {pending} вакансий (всего: {loaded_total})")
                            pending = 0

                    except Exception as e:
                        # Deliberate best-effort: one bad row must not stop the load.
                        print(f"Ошибка при обработке строки {idx + 1}: {e}")

                if pending:
                    await unit_of_work.commit()
                    loaded_total += pending
                    print(f"Загружено {pending} вакансий (всего: {loaded_total})")

            print("Готово!")
    finally:
        # Nested so the ML container is closed even if closing the backend
        # container raises.
        try:
            await backend_container.close()
        finally:
            await ml_container.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Run the loader only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
@@ -34,6 +34,7 @@ from template_project.application.resume.entity import (
|
|||||||
)
|
)
|
||||||
from template_project.application.user.entity import User
|
from template_project.application.user.entity import User
|
||||||
from template_project.application.user.profile.entity import Profile
|
from template_project.application.user.profile.entity import Profile
|
||||||
|
from template_project.application.vacancy.entity import Vacancy, VacancyEmbedding
|
||||||
|
|
||||||
meta_data: Final = MetaData()
|
meta_data: Final = MetaData()
|
||||||
mapper_registry: Final = registry()
|
mapper_registry: Final = registry()
|
||||||
@@ -205,6 +206,30 @@ resume_project_table: Final = Table(
|
|||||||
Column("description", String, nullable=False),
|
Column("description", String, nullable=False),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Schema of the "vacancy" table, registered on the shared meta_data.
vacancy_table: Final = Table(
    "vacancy",
    meta_data,
    Column("id", UUID, primary_key=True),
    # The only column declared without nullable=False.
    Column("deleted_at", DateTime(timezone=True)),
    Column("created_at", DateTime(timezone=True), nullable=False),
    Column("position", String, nullable=False),
    Column("from_salary", Numeric, nullable=False),
    Column("to_salary", Numeric, nullable=False),
    # Stored as a plain string column.
    Column("experience_type", String, nullable=False),
    Column("description", String, nullable=False),
    # The server-side default is an empty JSONB array.
    Column("key_skills", StringArrayType(), nullable=False, server_default=text("'[]'::jsonb")),
)
|
||||||
|
|
||||||
|
# Schema of the "vacancy_embedding" table, registered on the shared meta_data.
vacancy_embedding_table: Final = Table(
    "vacancy_embedding",
    meta_data,
    Column("id", UUID, primary_key=True),
    # The only column declared without nullable=False.
    Column("deleted_at", DateTime(timezone=True)),
    Column("created_at", DateTime(timezone=True), nullable=False),
    # Rows are removed together with their vacancy (ON DELETE CASCADE).
    Column("vacancy_id", UUID, ForeignKey("vacancy.id", ondelete="CASCADE"), nullable=False),
    # Vector is passed without a dimension argument.
    Column("vector", Vector, nullable=False),
)
|
||||||
|
|
||||||
|
|
||||||
mapper_registry.map_imperatively(User, user_table)
|
mapper_registry.map_imperatively(User, user_table)
|
||||||
mapper_registry.map_imperatively(AccessToken, access_token_table)
|
mapper_registry.map_imperatively(AccessToken, access_token_table)
|
||||||
@@ -229,3 +254,11 @@ mapper_registry.map_imperatively(
|
|||||||
mapper_registry.map_imperatively(ResumeExperience, resume_experience_table)
|
mapper_registry.map_imperatively(ResumeExperience, resume_experience_table)
|
||||||
mapper_registry.map_imperatively(ResumeEducation, resume_education_table)
|
mapper_registry.map_imperatively(ResumeEducation, resume_education_table)
|
||||||
mapper_registry.map_imperatively(ResumeProject, resume_project_table)
|
mapper_registry.map_imperatively(ResumeProject, resume_project_table)
|
||||||
|
# Imperative mapping of the vacancy entities onto their tables.
mapper_registry.map_imperatively(
    Vacancy,
    vacancy_table,
    properties={
        # Explicitly binds the key_skills attribute to the key_skills column.
        "key_skills": vacancy_table.c.key_skills,
    },
)
mapper_registry.map_imperatively(VacancyEmbedding, vacancy_embedding_table)
|
||||||
|
|||||||
@@ -1,12 +1,19 @@
|
|||||||
|
from datetime import UTC, datetime
|
||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
from typing import Any
|
from typing import NewType, Self
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from uuid_utils.compat import uuid7
|
||||||
|
|
||||||
from template_project.application.common.entity import Entity, to_entity
|
from template_project.application.common.entity import Entity, to_entity
|
||||||
from template_project.application.common.enums import ExperienceType
|
from template_project.application.common.enums import ExperienceType
|
||||||
|
|
||||||
|
VacancyId = NewType("VacancyId", UUID)
|
||||||
|
VacancyEmbeddingId = NewType("VacancyEmbeddingId", UUID)
|
||||||
|
|
||||||
|
|
||||||
@to_entity
|
@to_entity
|
||||||
class Vacancy(Entity[Any]):
|
class Vacancy(Entity[VacancyId]):
|
||||||
position: str
|
position: str
|
||||||
from_salary: Decimal
|
from_salary: Decimal
|
||||||
to_salary: Decimal
|
to_salary: Decimal
|
||||||
@@ -14,8 +21,42 @@ class Vacancy(Entity[Any]):
|
|||||||
description: str
|
description: str
|
||||||
key_skills: list[str]
|
key_skills: list[str]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def factory(
|
||||||
|
cls,
|
||||||
|
position: str,
|
||||||
|
from_salary: Decimal,
|
||||||
|
to_salary: Decimal,
|
||||||
|
experience_type: ExperienceType,
|
||||||
|
description: str,
|
||||||
|
key_skills: list[str],
|
||||||
|
) -> Self:
|
||||||
|
return cls(
|
||||||
|
id=VacancyId(uuid7()),
|
||||||
|
created_at=datetime.now(tz=UTC),
|
||||||
|
position=position,
|
||||||
|
from_salary=from_salary,
|
||||||
|
to_salary=to_salary,
|
||||||
|
experience_type=experience_type,
|
||||||
|
description=description,
|
||||||
|
key_skills=key_skills,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@to_entity
|
@to_entity
|
||||||
class VacancyEmbedding(Entity[Any]):
|
class VacancyEmbedding(Entity[VacancyEmbeddingId]):
|
||||||
vacancy_id: Any
|
vacancy_id: VacancyId
|
||||||
vector: list[float]
|
vector: list[float]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def factory(
|
||||||
|
cls,
|
||||||
|
vacancy_id: VacancyId,
|
||||||
|
vector: list[float],
|
||||||
|
) -> Self:
|
||||||
|
return cls(
|
||||||
|
id=VacancyEmbeddingId(uuid7()),
|
||||||
|
created_at=datetime.now(tz=UTC),
|
||||||
|
vacancy_id=vacancy_id,
|
||||||
|
vector=vector,
|
||||||
|
)
|
||||||
|
|||||||
@@ -0,0 +1,58 @@
|
|||||||
|
"""empty message
|
||||||
|
|
||||||
|
Revision ID: 9a32674539dd
|
||||||
|
Revises: 892aba57b356
|
||||||
|
Create Date: 2025-11-23 01:26:29.515334
|
||||||
|
|
||||||
|
"""
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
import sqlalchemy as sa
|
||||||
|
import pgvector.sqlalchemy
|
||||||
|
from sqlalchemy import Text
|
||||||
|
from sqlalchemy.dialects.postgresql import JSONB, UUID
|
||||||
|
import template_project.adapters.data_gateways.tables
|
||||||
|
from template_project.adapters.data_gateways.tables import StringArrayType
|
||||||
|
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = '9a32674539dd'
|
||||||
|
down_revision: Union[str, Sequence[str], None] = '892aba57b356'
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = None
|
||||||
|
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Upgrade schema.

    Creates the ``vacancy`` table and then the ``vacancy_embedding`` table,
    whose ``vacancy_id`` references ``vacancy.id`` with ``ON DELETE CASCADE``.
    """
    # "vacancy" is created first because the embedding table's foreign key
    # points at it.
    op.create_table(
        "vacancy",
        sa.Column("id", sa.UUID(), nullable=False),
        sa.Column("deleted_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
        sa.Column("position", sa.String(), nullable=False),
        sa.Column("from_salary", sa.Numeric(), nullable=False),
        sa.Column("to_salary", sa.Numeric(), nullable=False),
        sa.Column("experience_type", sa.String(), nullable=False),
        sa.Column("description", sa.String(), nullable=False),
        sa.Column(
            "key_skills",
            StringArrayType(astext_type=Text()),
            server_default=sa.text("'[]'::jsonb"),
            nullable=False,
        ),
        sa.PrimaryKeyConstraint("id"),
    )

    op.create_table(
        "vacancy_embedding",
        sa.Column("id", sa.UUID(), nullable=False),
        sa.Column("deleted_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
        sa.Column("vacancy_id", sa.UUID(), nullable=False),
        sa.Column("vector", pgvector.sqlalchemy.vector.VECTOR(), nullable=False),
        sa.ForeignKeyConstraint(["vacancy_id"], ["vacancy.id"], ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("id"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Downgrade schema by dropping ``vacancy_embedding`` and ``vacancy``."""
    # The embedding table goes first: it holds the foreign key to "vacancy".
    for table_name in ("vacancy_embedding", "vacancy"):
        op.drop_table(table_name)
|
||||||
Reference in New Issue
Block a user