-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added cron to scrape github topic repositories
- Loading branch information
Showing
12 changed files
with
383 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
use error::Result; | ||
use sea_orm::prelude::Uuid; | ||
use store::{github_projects, github_repositories}; | ||
|
||
#[async_trait] | ||
pub trait DbServiceContract { | ||
/// | ||
/// Insert new project | ||
/// | ||
async fn insert_project(&self, project: github_projects::ActiveModel) -> Result<()>; | ||
|
||
/// | ||
/// Insert new repository | ||
/// | ||
async fn insert_repository(&self, repository: github_repositories::ActiveModel) -> Result<()>; | ||
|
||
/// | ||
/// Run update status of github topic repositories | ||
/// | ||
async fn update_projects(&self) -> Result<()>; | ||
} | ||
|
||
#[async_trait] | ||
pub trait DbRepositoryContract { | ||
/// | ||
/// Get all projects from github topic repositories that are not scraped | ||
/// | ||
async fn get_unscraped_projects(&self) -> Result<Vec<String>>; | ||
|
||
/// | ||
/// Get unscraped github topic repositories, return (Project Name, Repository Name, Project ID) | ||
/// | ||
async fn get_unscraped_repositories(&self) -> Result<Vec<(String, String, Uuid)>>; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
use super::contract::{DbRepositoryContract, DbServiceContract}; | ||
use cronus::{Job, Schedule}; | ||
use error::Result; | ||
use sdks::github::{ | ||
data::{GithubRepository, ProfileInfo}, | ||
GithubContract, | ||
}; | ||
use sea_orm::{prelude::Uuid, ActiveValue::NotSet, Set}; | ||
use store::{github_projects, github_repositories}; | ||
|
||
pub struct GithubTopicsScraper< | ||
Repository: DbRepositoryContract, | ||
Service: DbServiceContract, | ||
Github: GithubContract, | ||
> { | ||
repository: Repository, | ||
service: Service, | ||
github: Github, | ||
} | ||
|
||
impl< | ||
Repository: DbRepositoryContract + Send + Sync + 'static, | ||
Service: DbServiceContract + Send + Sync + 'static, | ||
Github: GithubContract + Send + Sync + 'static, | ||
> GithubTopicsScraper<Repository, Service, Github> | ||
{ | ||
/// | ||
/// Creates GithubTopicsScraper | ||
/// | ||
pub fn new(repository: Repository, service: Service, github: Github) -> Self { | ||
Self { | ||
repository, | ||
service, | ||
github, | ||
} | ||
} | ||
|
||
/// | ||
/// Cron job that runs once a week | ||
/// | ||
async fn cron_job(&self) -> Result<()> { | ||
self.handle_projects().await?; | ||
self.handle_repositories().await?; | ||
self.service.update_projects().await?; | ||
Ok(()) | ||
} | ||
|
||
/// | ||
/// Download unscraped repositories | ||
/// | ||
async fn handle_repositories(&self) -> Result<()> { | ||
let repositories = self.repository.get_unscraped_repositories().await?; | ||
|
||
for (project_name, repository_name, project_id) in repositories { | ||
if let Err(error) = self | ||
.handle_repository(project_name, repository_name, project_id) | ||
.await | ||
{ | ||
error!("{}", error); | ||
} | ||
} | ||
|
||
Ok(()) | ||
} | ||
|
||
/// | ||
/// Download repository and store it in db | ||
/// | ||
async fn handle_repository( | ||
&self, | ||
project_name: String, | ||
repository_name: String, | ||
project_id: Uuid, | ||
) -> Result<()> { | ||
let GithubRepository { | ||
name, | ||
language, | ||
stargazers_count, | ||
forks_count, | ||
created_at, | ||
updated_at, | ||
archived, | ||
fork, | ||
} = self | ||
.github | ||
.get_repository(&project_name, &repository_name) | ||
.await?; | ||
|
||
let repository = github_repositories::ActiveModel { | ||
id: NotSet, | ||
project: Set(project_id), | ||
repository_name: Set(name), | ||
language: Set(language), | ||
stargazers_count: Set(stargazers_count), | ||
forks_count: Set(forks_count), | ||
created_at: Set(created_at), | ||
updated_at: Set(updated_at), | ||
archived: Set(archived), | ||
fork: Set(fork), | ||
}; | ||
|
||
self.service.insert_repository(repository).await?; | ||
Ok(()) | ||
} | ||
|
||
/// | ||
/// Download unscraped projects | ||
/// | ||
async fn handle_projects(&self) -> Result<()> { | ||
let projects = self.repository.get_unscraped_projects().await?; | ||
for project in projects { | ||
if let Err(error) = self.handle_project(project).await { | ||
error!("{}", error); | ||
} | ||
} | ||
|
||
Ok(()) | ||
} | ||
|
||
/// | ||
/// Download project and store it in db | ||
/// | ||
async fn handle_project(&self, project: String) -> Result<()> { | ||
let ProfileInfo { | ||
profile_type, | ||
followers, | ||
site, | ||
} = self.github.get_profile(&project).await?; | ||
|
||
let project = github_projects::ActiveModel { | ||
id: NotSet, | ||
name: Set(project), | ||
profile_type: Set(Some(profile_type.to_string())), | ||
url: Set(match site.is_empty() { | ||
true => None, | ||
false => Some(site), | ||
}), | ||
followers: Set(followers), | ||
}; | ||
|
||
self.service.insert_project(project).await | ||
} | ||
} | ||
|
||
#[async_trait] | ||
impl<Repository, Service, Github> Job for GithubTopicsScraper<Repository, Service, Github> | ||
where | ||
Repository: DbRepositoryContract + Send + Sync + 'static, | ||
Service: DbServiceContract + Send + Sync + 'static, | ||
Github: GithubContract + Send + Sync + 'static, | ||
{ | ||
fn schedule(&self) -> Schedule { | ||
"0 0 0 * * Mon".parse().expect("Invalid schedule") | ||
} | ||
async fn job(&self) { | ||
if let Err(error) = self.cron_job().await { | ||
error!("{}", error); | ||
} | ||
} | ||
} |
5 changes: 5 additions & 0 deletions
5
backend/api/src/jobs/github_topics_scraper/infrastructure/mod.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
mod service; | ||
mod repository; | ||
|
||
pub use service::PgService; | ||
pub use repository::PgRepository; |
68 changes: 68 additions & 0 deletions
68
backend/api/src/jobs/github_topics_scraper/infrastructure/repository.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
use error::Result; | ||
use sea_orm::{ | ||
prelude::Uuid, DatabaseBackend, DatabaseConnection, EntityTrait, FromQueryResult, Statement, | ||
}; | ||
use std::sync::Arc; | ||
|
||
use super::super::contract::DbRepositoryContract; | ||
use store::topics_repositories; | ||
|
||
const UNSCRAPED_PROJECTS_QUERY: &str = include_str!("sql/unscraped_projects.sql"); | ||
const UNSCRAPED_REPOSITORIES_QUERY: &str = include_str!("sql/unscraped_repositories.sql"); | ||
|
||
pub struct PgRepository { | ||
conn: Arc<DatabaseConnection>, | ||
} | ||
|
||
impl PgRepository { | ||
pub fn new(conn: Arc<DatabaseConnection>) -> Self { | ||
Self { conn } | ||
} | ||
} | ||
|
||
#[async_trait] | ||
impl DbRepositoryContract for PgRepository { | ||
async fn get_unscraped_projects(&self) -> Result<Vec<String>> { | ||
let projects = topics_repositories::Entity::find() | ||
.from_raw_sql(Statement::from_string( | ||
DatabaseBackend::Postgres, | ||
UNSCRAPED_PROJECTS_QUERY, | ||
)) | ||
.into_model::<RepositoryOwner>() | ||
.all(self.conn.as_ref()) | ||
.await? | ||
.into_iter() | ||
.map(|repo| repo.project_name) | ||
.collect(); | ||
|
||
Ok(projects) | ||
} | ||
|
||
async fn get_unscraped_repositories(&self) -> Result<Vec<(String, String, Uuid)>> { | ||
let repositories = topics_repositories::Entity::find() | ||
.from_raw_sql(Statement::from_string( | ||
DatabaseBackend::Postgres, | ||
UNSCRAPED_REPOSITORIES_QUERY, | ||
)) | ||
.into_model::<Repository>() | ||
.all(self.conn.as_ref()) | ||
.await? | ||
.into_iter() | ||
.map(|repo| (repo.project_name, repo.repository_name, repo.repository_owner)) | ||
.collect(); | ||
|
||
Ok(repositories) | ||
} | ||
} | ||
|
||
#[derive(FromQueryResult)] | ||
pub struct RepositoryOwner { | ||
project_name: String, | ||
} | ||
|
||
#[derive(FromQueryResult)] | ||
pub struct Repository { | ||
project_name: String, | ||
repository_name: String, | ||
repository_owner: Uuid, | ||
} |
48 changes: 48 additions & 0 deletions
48
backend/api/src/jobs/github_topics_scraper/infrastructure/service.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
use error::Result; | ||
use sea_orm::{DatabaseConnection, EntityTrait, Statement, ConnectionTrait}; | ||
use std::sync::Arc; | ||
|
||
use super::super::contract::DbServiceContract; | ||
use store::{github_projects, github_repositories}; | ||
|
||
const UPDATE_STATEMENT: &str = include_str!("sql/update_statement.sql"); | ||
|
||
pub struct PgService { | ||
conn: Arc<DatabaseConnection>, | ||
} | ||
|
||
impl PgService { | ||
pub fn new(conn: Arc<DatabaseConnection>) -> Self { | ||
Self { conn } | ||
} | ||
} | ||
|
||
#[async_trait] | ||
impl DbServiceContract for PgService { | ||
async fn insert_project(&self, project: github_projects::ActiveModel) -> Result<()> { | ||
github_projects::Entity::insert(project) | ||
.exec(self.conn.as_ref()) | ||
.await?; | ||
|
||
Ok(()) | ||
} | ||
|
||
async fn insert_repository(&self, repository: github_repositories::ActiveModel) -> Result<()> { | ||
github_repositories::Entity::insert(repository) | ||
.exec(self.conn.as_ref()) | ||
.await?; | ||
|
||
Ok(()) | ||
} | ||
|
||
async fn update_projects(&self) -> Result<()> { | ||
self.conn | ||
.as_ref() | ||
.execute(Statement::from_string( | ||
sea_orm::DatabaseBackend::Postgres, | ||
UPDATE_STATEMENT, | ||
)) | ||
.await?; | ||
Ok(()) | ||
} | ||
} |
5 changes: 5 additions & 0 deletions
5
backend/api/src/jobs/github_topics_scraper/infrastructure/sql/unscraped_projects.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
select distinct tr.repository_owner as project_name | ||
from topics_repositories tr | ||
full outer join github_projects gp on gp."name" = tr.repository_owner | ||
where tr.scraped = false | ||
and gp.id isnull |
6 changes: 6 additions & 0 deletions
6
backend/api/src/jobs/github_topics_scraper/infrastructure/sql/unscraped_repositories.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
select tr.repository_owner as project_name, | ||
tr.repository_name as repository_name, | ||
gp.id as repository_owner | ||
from topics_repositories tr | ||
inner join github_projects gp on gp."name" = tr.repository_owner | ||
where tr.scraped = false |
11 changes: 11 additions & 0 deletions
11
backend/api/src/jobs/github_topics_scraper/infrastructure/sql/update_statement.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
update topics_repositories | ||
set scraped = true | ||
from ( | ||
select tr.id | ||
from topics_repositories tr | ||
inner join github_projects gp on gp."name" = tr.repository_owner | ||
inner join github_repositories gr on gr.repository_name = tr.repository_name | ||
and gr.project = gp.id | ||
where not tr.scraped | ||
) as subquery | ||
where topics_repositories.id = subquery.id |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
use std::sync::Arc; | ||
|
||
use cronus::Cronus; | ||
use sdks::github::Github; | ||
use sea_orm::DatabaseConnection; | ||
|
||
use self::{ | ||
domain::GithubTopicsScraper, | ||
infrastructure::{PgRepository, PgService}, | ||
}; | ||
|
||
mod contract; | ||
mod domain; | ||
mod infrastructure; | ||
|
||
pub fn setup(cron: &Cronus, sea_pool: Arc<DatabaseConnection>) { | ||
let job = create_gr(sea_pool); | ||
cron.add(job).expect("Error adding job"); | ||
} | ||
|
||
fn create_gr( | ||
sea_pool: Arc<DatabaseConnection>, | ||
) -> GithubTopicsScraper<PgRepository, PgService, Github> { | ||
let repository = PgRepository::new(sea_pool.clone()); | ||
let service = PgService::new(sea_pool); | ||
let github_api_key = config::get("GITHUB_KEY").ok(); | ||
|
||
let github = match github_api_key { | ||
Some(api_key) => Github::new_with_auth(api_key), | ||
None => Github::default(), | ||
}; | ||
|
||
GithubTopicsScraper::new(repository, service, github) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.