не помню

This commit is contained in:
ivankirpichnikov
2025-11-22 18:15:29 +03:00
parent 995141a200
commit effbcfbc2d
17 changed files with 210 additions and 54 deletions
+39 -20
View File
@@ -1,30 +1,49 @@
import json
import logging
import urllib.parse
from collections.abc import Sequence
from pathlib import Path
from typing import Final
from adaptix import DebugTrail, NameStyle, Retort, name_mapping
from dataset.data_structures import DataSetLine, Salary
from dataset.upload_key_skills import upload_key_kills
# Module-level adaptix Retort used to load dataset records.
# - Salary fields are mapped with the camelCase name style.
# - Debug trails are disabled and strict coercion is switched off.
# NOTE(review): parse_dataset constructs its own Retort with the same settings;
# confirm whether this module-level instance is still needed.
retort = Retort(
recipe=[
name_mapping(Salary, name_style=NameStyle.CAMEL),
],
debug_trail=DebugTrail.DISABLE,
strict_coercion=False,
)
# Location of the dataset file (one JSON document per line); a relative path,
# so it is resolved against the current working directory when opened.
DATASET_PATH: Final = Path("hh_ru_vacancies.jsonlines")
# Base address of the service the key skills are uploaded to.
BASE_URL: Final = "https://team-39-alpha-gm5qjkou.hack.prodcontest.ru"
# Upload endpoint: BASE_URL joined with the "key_skills" path.
UPLOAD_KEY_SKILLS: Final = urllib.parse.urljoin(BASE_URL, "key_skills")
# Ad-hoc exploration executed at import time: load the whole dataset and print
# the distinct `experience` values seen in the first 1000 records.
with DATASET_PATH.open("r", encoding="utf-8") as dataset_file:
    # Blank lines (e.g. a trailing newline) are skipped: json.loads rejects them.
    raw_lines = [json.loads(raw) for raw in dataset_file if raw.strip()]
logger = logging.getLogger(__name__)
lines = retort.load(raw_lines, list[DataSetLine])
f = set()
c = 0
for line in lines:
    if c == 1000:
        break
    if line.experience:
        f.add(line.experience)
    # Count every inspected record. The previous `c += 0` never advanced the
    # counter, so the 1000-record cap above could not trigger.
    c += 1
print(f)
def parse_dataset(file_path: Path) -> Sequence[DataSetLine]:
    """Load a JSON-lines dataset file into ``DataSetLine`` records.

    Args:
        file_path: Path to a UTF-8 text file holding one JSON document per line.

    Returns:
        The records produced by the adaptix loader for the parsed lines.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    retort = Retort(
        recipe=[
            name_mapping(Salary, name_style=NameStyle.CAMEL),
        ],
        debug_trail=DebugTrail.DISABLE,
        strict_coercion=False,
    )
    with file_path.open("r", encoding="utf-8") as dataset_file:
        # Iterate the file directly instead of readlines() and parse eagerly, so
        # the loader receives a real list rather than a one-shot map iterator.
        # Blank lines are skipped because json.loads rejects them.
        raw_lines = [json.loads(raw) for raw in dataset_file if raw.strip()]
    return retort.load(raw_lines, Sequence[DataSetLine])
def main() -> None:
    """Parse the dataset file and upload the key skills of its records."""
    logging.basicConfig(level=logging.INFO)

    logger.info("Parsing dataset...")
    upload_key_kills(parse_dataset(DATASET_PATH), UPLOAD_KEY_SKILLS)

    logger.info("finished script")


if __name__ == "__main__":
    main()
+37
View File
@@ -0,0 +1,37 @@
import logging
from collections.abc import Sequence
from requests import Session
from dataset.data_structures import DataSetLine
logger = logging.getLogger(__name__)
def upload_key_kills(
    dataset: Sequence[DataSetLine],
    upload_endpoint: str,
    max_upload_count: int | None = None,
    request_timeout: float = 30.0,
) -> None:
    """POST the key skills of dataset records to ``upload_endpoint``.

    One request is sent per record, with the JSON body
    ``{"key_skills": [...]}``. A response with a status other than 200 is
    logged as a warning and processing continues with the next record.

    Args:
        dataset: Records whose ``key_skills`` are uploaded.
        upload_endpoint: URL that receives the POST requests.
        max_upload_count: Maximum number of records to process; ``None``
            processes the whole dataset.
        request_timeout: Seconds to wait for each request before giving up.

    Raises:
        requests.exceptions.RequestException: On connection errors or when a
            request exceeds ``request_timeout``.
    """
    # The context manager closes the session (and its pooled connections)
    # even when a request raises.
    with Session() as session:
        key_skills = []
        for count, line in enumerate(dataset):
            if max_upload_count is not None and count >= max_upload_count:
                break

            key_skills.extend(line.key_skills)
            logger.info("Upload skills %r", key_skills)
            response = session.post(
                upload_endpoint,
                json={"key_skills": key_skills},
                # Without a timeout requests waits indefinitely on a stalled server.
                timeout=request_timeout,
            )
            if response.status_code != 200:
                logger.warning("Doesn't upload skills. Status code %r", response.status_code)
            else:
                logger.info("Upload skills %r. Status code %r", key_skills, response.status_code)
            # Start a fresh batch for the next record.
            key_skills = []
@@ -15,7 +15,7 @@ class KeySkillsDataGateway:
async def query(self, query: str) -> Sequence[str]:
statement = (
select(key_skills_table.c.name)
.where(key_skills_table.c.name.ilike(f"{query}%"))
.where(key_skills_table.c.name.ilike(f"%{query}%"))
.order_by(key_skills_table.c.name)
.limit(30)
)
@@ -6,7 +6,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from template_project.adapters.data_gateways.tables import resume_prediction_table, resume_table
from template_project.application.resume.data_gateway import ResumeDataGateway, ResumePredictionDataGateway
from template_project.application.resume.entity import Resume, ResumeEmbeddingId, ResumeId, ResumePrediction
from template_project.application.resume.entity import Resume, ResumeId, ResumePrediction
from template_project.application.resume.errors import ResumeNotFoundError
from template_project.application.user.entity import UserId
@@ -15,10 +15,6 @@ class DefaultResumeDataGateway(ResumeDataGateway):
def __init__(self, session: AsyncSession) -> None:
self._session = session
@override
async def get_suitable_resumes(self, embedding_id: ResumeEmbeddingId) -> Sequence[Resume]:
raise NotImplementedError
@override
async def load(self, resume_id: ResumeId) -> Resume:
resume = await self._session.get(Resume, resume_id)
@@ -3,7 +3,6 @@ from typing import Any, Final, override
from pgvector.sqlalchemy import Vector
from sqlalchemy import (
ARRAY,
Boolean,
Column,
DateTime,
@@ -23,6 +22,7 @@ from sqlalchemy.orm import registry
from template_project.application.access_token.entity import AccessToken
from template_project.application.auth_identity.entity import AuthIdentity, AuthMethod
from template_project.application.common.enums import ExperienceType
from template_project.application.notification_device.entity import NotificationDevice
from template_project.application.resume.entity import Resume, ResumeEmbedding, ResumePrediction
from template_project.application.user.entity import User
@@ -160,6 +160,28 @@ key_skills_table: Final = Table(
Column("id", Integer, autoincrement=True, primary_key=True),
Column("name", String, nullable=False, unique=True)
)
# Schema of the "vacancies" table.
# NOTE(review): "description" and "key_skills" are declared without a column
# type, so SQLAlchemy falls back to NullType for them. Confirm the intended
# types (presumably text and a list of strings) and declare them explicitly.
vacancy_table: Final = Table(
"vacancies",
meta_data,
Column("id", UUID, primary_key=True),
Column("deleted_at", DateTime(timezone=True)),
Column("created_at", DateTime(timezone=True), nullable=False),
Column("position", String, nullable=False),
Column("from_salary", Numeric, nullable=False),
Column("to_salary", Numeric, nullable=False),
Column("experience_type", Enum(ExperienceType), nullable=False),
Column("description", nullable=False),
Column("key_skills", nullable=False),
)
# Schema of the "vacancy_embedding" table: embedding vectors that reference a
# row in "vacancies" through vacancy_id (deleted together with the vacancy via
# ON DELETE CASCADE).
# NOTE(review): Vector is declared without a dimension — confirm whether a
# fixed vector size should be enforced here.
vacancy_embedding_table: Final = Table(
"vacancy_embedding",
meta_data,
Column("id", UUID, primary_key=True),
Column("deleted_at", DateTime(timezone=True)),
Column("created_at", DateTime(timezone=True), nullable=False),
Column("vacancy_id", UUID, ForeignKey("vacancies.id", ondelete="CASCADE"), nullable=False),
Column("vector", Vector, nullable=False),
)
mapper_registry.map_imperatively(User, user_table)
@@ -0,0 +1,24 @@
from collections.abc import Sequence
from typing import override
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from template_project.adapters.data_gateways.tables import vacancy_embedding_table, vacancy_table
from template_project.application.vacancy.data_gateway import VacancyDataGateway
from template_project.application.vacancy.entity import Vacancy, VacancyEmbedding
class DefaultVacancyDataGateway(VacancyDataGateway):
    """``VacancyDataGateway`` implementation backed by an ``AsyncSession``."""

    def __init__(self, session: AsyncSession) -> None:
        self._session = session

    @override
    async def get_suitable(self, vector: list[float]) -> Sequence[Vacancy]:
        """Return vacancies whose stored embedding is close to ``vector``.

        Args:
            vector: Query embedding to compare against stored vacancy embeddings.

        Returns:
            Vacancies with a cosine distance below 0.5 to ``vector``, closest first.
        """
        distance = vacancy_embedding_table.c.vector.cosine_distance(vector)
        statement = (
            select(Vacancy)
            # Join through the foreign key: the embedding row's own id is
            # unrelated to the vacancy id.
            .join(VacancyEmbedding, vacancy_embedding_table.c.vacancy_id == vacancy_table.c.id)
            # Cosine distance is 0 for identical directions, so "suitable"
            # means a small distance, not a large one.
            .where(distance < 0.5)
            .order_by(distance)
        )
        result = await self._session.execute(statement)
        return result.scalars().all()
@@ -0,0 +1,12 @@
from typing import cast
from httpx import AsyncClient
class MlApiGateway:
    """Client for the ML service endpoint that produces text embeddings."""

    def __init__(self, client: AsyncClient) -> None:
        self._client = client

    async def generate_embedding(self, text: str) -> list[float]:
        """Request an embedding vector for ``text``.

        Args:
            text: Text to embed; sent as ``{"text": text}`` to ``/get_embedding``.

        Returns:
            The value of the ``"embedding"`` key of the JSON response.

        Raises:
            httpx.HTTPStatusError: If the service answers with a 4xx/5xx status.
            KeyError: If the response body has no ``"embedding"`` key.
        """
        response = await self._client.post("/get_embedding", json={"text": text})
        # Surface HTTP failures explicitly instead of as a KeyError or JSON
        # decoding error on the error payload.
        response.raise_for_status()
        return cast(list[float], response.json()["embedding"])
@@ -0,0 +1,36 @@
from typing import Final, override
from template_project.application.common.enums import ExperienceType
from template_project.adapters.ml_client import MlApiGateway
from template_project.application.resume.vector_generator import ResumeEmbeddingVectorGenerator
# Text template rendered for a resume before it is sent to the embedding
# endpoint. The labels are Russian for: position, experience, key skills,
# description. The placeholders are filled in
# DefaultResumeEmbeddingVectorGenerator.generate; the literal begins and ends
# with a newline.
EMBEDDING_TEXT_TEMPLATE: Final = """
Позиция: {position}
Опыт: {experience_type}
Ключевые навыки: {key_skills}
Описание: {about_me}
"""
class DefaultResumeEmbeddingVectorGenerator(ResumeEmbeddingVectorGenerator):
    """Generates resume embeddings by rendering a text and sending it to the ML API."""

    def __init__(self, ml_api_gateway: MlApiGateway) -> None:
        self._ml_api_gateway = ml_api_gateway

    @override
    async def generate(
        self,
        position: str,
        about_me: str,
        experience_type: ExperienceType,
        key_skills: list[str],
    ) -> list[float]:
        """Render the resume fields into the template and return its embedding.

        Key skills are joined with ``", "`` before being placed in the template.
        """
        joined_skills = ", ".join(key_skills)
        rendered = EMBEDDING_TEXT_TEMPLATE.format(
            position=position,
            experience_type=experience_type,
            key_skills=joined_skills,
            about_me=about_me,
        )
        embedding = await self._ml_api_gateway.generate_embedding(rendered)
        return embedding
@@ -4,7 +4,6 @@ from typing import Protocol
from template_project.application.resume.entity import (
Resume,
ResumeEmbeddingId,
ResumeId,
ResumePrediction,
)
@@ -12,10 +11,6 @@ from template_project.application.user.entity import UserId
class ResumeDataGateway(Protocol):
@abstractmethod
async def get_suitable_resumes(self, embedding_id: ResumeEmbeddingId) -> Sequence[Resume]:
raise NotImplementedError
@abstractmethod
async def load(self, resume_id: ResumeId) -> Resume:
raise NotImplementedError
@@ -3,19 +3,20 @@ from collections.abc import Callable
from Levenshtein import ratio
from template_project.application.common.unit_of_work import UnitOfWork
from template_project.application.resume.data_gateway import ResumeDataGateway
from template_project.application.resume.entity import Resume, ResumeEmbedding, ResumePrediction
from template_project.application.resume.vector_generator import ResumeEmbeddingVectorGenerator
from template_project.application.vacancy.data_gateway import VacancyDataGateway
from template_project.application.vacancy.entity import Vacancy
def suitable_resumes_key(
def suitable_vacancies_key(
resume: Resume,
) -> Callable[[Resume], bool]:
def wrapper(suitable_resume: Resume) -> bool:
) -> Callable[[Vacancy], bool]:
def wrapper(vacancy: Vacancy) -> bool:
count_skills = 0
ratio_skill_sum = 0.0
for resum_key_skill in resume.key_skills:
for suitable_resume_key_skill in suitable_resume.key_skills:
for suitable_resume_key_skill in vacancy.key_skills:
ratio_skill = ratio(resum_key_skill, suitable_resume_key_skill)
if ratio_skill != 0:
count_skills += 1
@@ -26,7 +27,7 @@ def suitable_resumes_key(
except ZeroDivisionError:
matching_skills = 0
return resume.experience_type == suitable_resume.experience_type and matching_skills >= 50
return resume.experience_type == vacancy.experience_type and matching_skills >= 50
return wrapper
@@ -35,11 +36,11 @@ class ResumeEmbeddingPipeline:
def __init__(
self,
unit_of_work: UnitOfWork,
resume_data_gateway: ResumeDataGateway,
vacancy_data_gateway: VacancyDataGateway,
vector_generator: ResumeEmbeddingVectorGenerator,
) -> None:
self.unit_of_work = unit_of_work
self.resume_data_gateway = resume_data_gateway
self.vacancy_data_gateway = vacancy_data_gateway
self.vector_generator = vector_generator
async def run(
@@ -50,18 +51,20 @@ class ResumeEmbeddingPipeline:
position=resume.position,
about_me=resume.about_me,
key_skills=resume.key_skills,
experience_type=resume.experience_type,
)
resume_embedding = ResumeEmbedding.factory(
resume_id=resume.id,
vector=vector,
)
suitable_resumes = await self.resume_data_gateway.get_suitable_resumes(resume_embedding.id)
suitable_resumes_filtered = sorted(
suitable_resumes,
key=suitable_resumes_key(resume),
suitable_vacancies = await self.vacancy_data_gateway.get_suitable(resume_embedding.vector)
suitable_vacancies_filtered = sorted(
suitable_vacancies,
key=suitable_vacancies_key(resume),
)
suitable_resumes = suitable_resumes_filtered[:50]
suitable_vacancies = suitable_vacancies_filtered[:50]
# TODO: тут надо сделать отправку в ИИ
@@ -0,0 +1 @@
# class ResumePrediction
@@ -1,13 +1,15 @@
from abc import abstractmethod
from typing import Protocol
from template_project.application.common.enums import ExperienceType
class ResumeEmbeddingVectorGenerator(Protocol):
    """Interface for components that turn resume fields into an embedding vector.

    Declared as a ``Protocol`` so that ``@abstractmethod`` is actually enforced
    on explicit subclasses; on a plain class the decorator has no effect.
    """

    @abstractmethod
    async def generate(
        self,
        position: str,
        about_me: str,
        experience_type: ExperienceType,
        key_skills: list[str],
    ) -> list[float]:
        """Return the embedding vector for the given resume fields.

        Args:
            position: Position stated in the resume.
            about_me: Free-text description from the resume.
            experience_type: Experience category of the resume.
            key_skills: Key skills listed in the resume.
        """
        raise NotImplementedError
@@ -0,0 +1,11 @@
from abc import abstractmethod
from collections.abc import Sequence
from typing import Protocol
from template_project.application.vacancy.entity import Vacancy
class VacancyDataGateway(Protocol):
    """Interface for looking up vacancies by embedding vector."""

    @abstractmethod
    async def get_suitable(self, vector: list[float]) -> Sequence[Vacancy]:
        """Return the vacancies considered suitable for ``vector``."""
        raise NotImplementedError
@@ -32,8 +32,7 @@ class PredictSalaryResponse:
class PredictSalaryInteractor:
async def execute(self, request: PredictSalaryRequest) -> PredictSalaryResponse:
return PredictSalaryResponse(
salary_from=Decimal("50000"),
salary_to=Decimal("80000"),
salary_from=Decimal(50000),
salary_to=Decimal(80000),
recommended_skills=["python", "django", "postgresql"],
)
@@ -9,4 +9,3 @@ class InteractorProvider(Provider):
interactors = provide_all(
PredictSalaryInteractor,
)
+1 -3
View File
@@ -2,14 +2,13 @@ from decimal import Decimal
from dishka import FromDishka
from dishka.integrations.fastapi import DishkaRoute
from fastapi import APIRouter, status
from fastapi import APIRouter
from pydantic import BaseModel, Field
from template_project.application.resume.entity import ResumeId
from template_project.ml.interactors.predict_salary import (
PredictSalaryInteractor,
PredictSalaryRequest,
PredictSalaryResponse,
VacancyInput,
)
@@ -120,4 +119,3 @@ async def predict_salary(
salary_to=response.salary_to,
recommended_skills=response.recommended_skills,
)
@@ -7,6 +7,7 @@ from template_project.adapters.data_gateways.notification_device import DefaultN
from template_project.adapters.data_gateways.profile import DefaultProfileDataGateway
from template_project.adapters.data_gateways.resume import DefaultResumeDataGateway, DefaultResumePredictionDataGateway
from template_project.adapters.data_gateways.user import DefaultUserDataGateway
from template_project.adapters.data_gateways.vacancy import DefaultVacancyDataGateway
from template_project.adapters.unit_of_work import DefaultUnitOfWork
@@ -16,6 +17,7 @@ class DataGatewayProvider(Provider):
unit_of_work = provide(WithParents[DefaultUnitOfWork])
data_gateways = provide_all(
KeySkillsDataGateway,
WithParents[DefaultVacancyDataGateway],
WithParents[DefaultUserDataGateway],
WithParents[DefaultAccessTokenDataGateway],
WithParents[DefaultAuthIdentityDataGateway],