diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2eeff7a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) [2023] [Jiri Karlik] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Procfile b/Procfile new file mode 100644 index 0000000..d5df67f Binary files /dev/null and b/Procfile differ diff --git a/README.md b/README.md new file mode 100644 index 0000000..ead952f --- /dev/null +++ b/README.md @@ -0,0 +1,68 @@ +# Jobs_aggregator + +Jobs_aggregator is an educational project that demonstrates web scraping using Selenium and BeautifulSoup modules to extract job data from the job portal Jobs.cz. The project also includes a web application built with the Django framework, with dynamic frontend elements implemented using JavaScript. 
+ +## Project Goals + +The primary goals of this project are: + +- Learn Django framework by building the first web application +- Gain knowledge and experience in web scraping techniques +- Create a project that can be demonstrated during interviews + +## Functionality + +The current features of Jobs_aggregator include: + +- User authentication: Users must log in to access the scraping functionality. +- Customized authentication: Authentication function in Django is adjusted, and BruteBuster module is implemented to protect against brute force attacks. +- Job scraping: Users can select a job title and city to scrape data from Jobs.cz. +- Data storage: Scraped job data is stored in a SQLite3 database using Django models. +- Data rendering: The scraped data is rendered in an HTML table. + - Table filtering: Users can filter the table by text, salary (indicated/not indicated), and junior (junior included in the title). + - Table sorting: Users can sort the table by all headings, except URL, by clicking on the headers (switching between ascending and descending order). + +## Observations + +Throughout the project, there were instances where the direction and scope of the project evolved. The initial idea was to build a web app in Django that could scrape jobs automatically and provide users with rendered data. However, the project took a different path, allowing users to perform the scraping themselves, which required additional adjustments. + +For future projects, it is essential to clearly define the project goals, functionalities, and prioritize them accordingly to avoid ambiguity and ensure a smoother development process. + +## Installation and Usage + +To use Jobs_aggregator locally, follow these steps: + +1. Clone the repository: `git clone https://github.com/your-username/Jobs_aggregator.git` +2. Install the required dependencies: `pip install -r requirements.txt` +3. 
Edit the `decorators.py` file in the BruteBuster module: + - Locate the file at `brutebuster/decorators.py` + - In line 45, modify the code snippet as follows: + ```python + if fa.recent_failure(): + if fa.too_many_failures(): + fa.failures += 1 + fa.save() + return False # MODIFY HERE +4. Set up the database: `python manage.py migrate` +5. Start the development server: `python manage.py runserver` +6. Access the web application in your browser at `http://localhost:8000/dashboard` + +## Contributing + +Contributions to Jobs_aggregator are welcome! If you would like to contribute, please follow these steps: + +1. Fork the repository. +2. Create a new branch for your feature or bug fix. +3. Make your modifications. +4. Commit and push your changes to your forked repository. +5. Submit a pull request describing your changes. + +## License + +This project is licensed under the [MIT License](LICENSE). + +## Contact + +If you have any questions or need further assistance, please feel free to contact the project owner. + +Enjoy using Jobs_aggregator! diff --git a/jobs_aggregator/__init__.py b/jobs_aggregator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/jobs_aggregator/asgi.py b/jobs_aggregator/asgi.py new file mode 100644 index 0000000..4a068db --- /dev/null +++ b/jobs_aggregator/asgi.py @@ -0,0 +1,16 @@ +""" +ASGI config for jobs_aggregator project. + +It exposes the ASGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/ +""" + +import os + +from django.core.asgi import get_asgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'jobs_aggregator.settings') + +application = get_asgi_application() diff --git a/jobs_aggregator/settings.py b/jobs_aggregator/settings.py new file mode 100644 index 0000000..548ed08 --- /dev/null +++ b/jobs_aggregator/settings.py @@ -0,0 +1,127 @@ +""" +Django settings for jobs_aggregator project. 
+ +Generated by 'django-admin startproject' using Django 4.2.1. + +For more information on this file, see +https://docs.djangoproject.com/en/4.2/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/4.2/ref/settings/ +""" + +from pathlib import Path + +# Build paths inside the project like this: BASE_DIR / 'subdir'. +BASE_DIR = Path(__file__).resolve().parent.parent + + +# Quick-start development settings - unsuitable for production +# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = 'django-insecure-fphcl#p=us6e_h!@ggvrwkv!8rd(s)$sv%2c9umb(8(0bu7y#m' + +# SECURITY WARNING: don't run with debug turned on in production! +DEBUG = True + +ALLOWED_HOSTS = [] + + +# Application definition + +INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + 'jobs_dashboard', + 'BruteBuster' +] + +MIDDLEWARE = [ + 'django.middleware.security.SecurityMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.middleware.common.CommonMiddleware', + 'django.middleware.csrf.CsrfViewMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.contrib.messages.middleware.MessageMiddleware', + 'django.middleware.clickjacking.XFrameOptionsMiddleware', + 'BruteBuster.middleware.RequestMiddleware' +] + +ROOT_URLCONF = 'jobs_aggregator.urls' + +TEMPLATES = [ + { + 'BACKEND': 'django.template.backends.django.DjangoTemplates', + 'DIRS': [], + 'APP_DIRS': True, + 'OPTIONS': { + 'context_processors': [ + 'django.template.context_processors.debug', + 'django.template.context_processors.request', + 'django.contrib.auth.context_processors.auth', + 'django.contrib.messages.context_processors.messages', + ], + }, + }, +] + +WSGI_APPLICATION = 
'jobs_aggregator.wsgi.application' + + +# Database +# https://docs.djangoproject.com/en/4.2/ref/settings/#databases + +DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.sqlite3', + 'NAME': BASE_DIR / 'db.sqlite3', + } +} + + +# Password validation +# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators + +AUTH_PASSWORD_VALIDATORS = [ + { + 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', + }, +] + + +# Internationalization +# https://docs.djangoproject.com/en/4.2/topics/i18n/ + +LANGUAGE_CODE = 'en-us' + +TIME_ZONE = 'UTC' + +USE_I18N = True + +USE_TZ = True + + +# Static files (CSS, JavaScript, Images) +# https://docs.djangoproject.com/en/4.2/howto/static-files/ + +STATIC_URL = 'static/' + +# Default primary key field type +# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field + +DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' +ALLOWED_HOSTS = ['192.168.0.26', 'localhost', '127.0.0.1'] diff --git a/jobs_aggregator/urls.py b/jobs_aggregator/urls.py new file mode 100644 index 0000000..c96df4a --- /dev/null +++ b/jobs_aggregator/urls.py @@ -0,0 +1,23 @@ +""" +URL configuration for jobs_aggregator project. + +The `urlpatterns` list routes URLs to views. For more information please see: + https://docs.djangoproject.com/en/4.2/topics/http/urls/ +Examples: +Function views + 1. Add an import: from my_app import views + 2. Add a URL to urlpatterns: path('', views.home, name='home') +Class-based views + 1. Add an import: from other_app.views import Home + 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') +Including another URLconf + 1. Import the include() function: from django.urls import include, path + 2. 
Add a URL to urlpatterns: path('blog/', include('blog.urls')) +""" +from django.contrib import admin +from django.urls import include, path + +urlpatterns = [ + path('admin/', admin.site.urls), + path('', include("jobs_dashboard.urls")) +] diff --git a/jobs_aggregator/wsgi.py b/jobs_aggregator/wsgi.py new file mode 100644 index 0000000..93d0a8c --- /dev/null +++ b/jobs_aggregator/wsgi.py @@ -0,0 +1,16 @@ +""" +WSGI config for jobs_aggregator project. + +It exposes the WSGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'jobs_aggregator.settings') + +application = get_wsgi_application() diff --git a/jobs_dashboard/__init__.py b/jobs_dashboard/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/jobs_dashboard/admin.py b/jobs_dashboard/admin.py new file mode 100644 index 0000000..8c38f3f --- /dev/null +++ b/jobs_dashboard/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. 
diff --git a/jobs_dashboard/apps.py b/jobs_dashboard/apps.py new file mode 100644 index 0000000..cbffbf3 --- /dev/null +++ b/jobs_dashboard/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class JobsDashboardConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'jobs_dashboard' diff --git a/jobs_dashboard/config.py b/jobs_dashboard/config.py new file mode 100644 index 0000000..10ea6c7 --- /dev/null +++ b/jobs_dashboard/config.py @@ -0,0 +1,8 @@ +import os + +from django.core.wsgi import get_wsgi_application + +os.environ['DJANGO_SETTINGS_MODULE'] = 'jobs_aggregator.settings' +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jobs_aggregator.settings") + +application = get_wsgi_application() diff --git a/jobs_dashboard/forms.py b/jobs_dashboard/forms.py new file mode 100644 index 0000000..859c551 --- /dev/null +++ b/jobs_dashboard/forms.py @@ -0,0 +1,11 @@ +from django import forms +from .models import JudgeDetail + +class JudgesForm(forms.ModelForm): + class Meta: + model = JudgeDetail #Model only used to creat form fields atm + fields = '__all__' + widgets = { + 'password': forms.PasswordInput(), + + } \ No newline at end of file diff --git a/jobs_dashboard/jobs_scraper.py b/jobs_dashboard/jobs_scraper.py new file mode 100644 index 0000000..2d97e6d --- /dev/null +++ b/jobs_dashboard/jobs_scraper.py @@ -0,0 +1,125 @@ +from bs4 import BeautifulSoup +from selenium import webdriver +from datetime import date, timedelta +from .support import parse_czech_date +from jobs_dashboard.models import Job +from selenium.webdriver.chrome.service import Service +import logging + +# Setup logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Chrome options for Selenium WebDriver +chrome_options = webdriver.ChromeOptions() +chrome_options.add_argument("--no-sandbox") +chrome_options.add_argument("--headless") +chrome_options.add_argument("--disable-gpu") + +def save_page_source(page_source, page_num): 
+ """ + Saves the page source to an HTML file. + """ + try: + with open(f"page_{page_num}.html", "w", encoding="utf-8") as file: + file.write(page_source) + logger.info(f"Saved page {page_num} as page_{page_num}.html") + except Exception as e: + logger.error(f"Failed to save page {page_num}: {e}") + +def scrape_jobsCZ(URL): + """ + Scrapes data from all URL subpages. + """ + data_total = [] + page_num = 1 + + # Use context manager for WebDriver to ensure it closes properly + with webdriver.Chrome(options=chrome_options) as driver: + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + } + driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": headers["User-Agent"]}) + + try: + while True: + driver.get(URL + str(page_num)) + page = driver.page_source + #save_page_source(page, page_num) + soup = BeautifulSoup(page, "html.parser") + data_temp = soup.find_all('article', {"class": "SearchResultCard"}) + + if soup.find_all('div', {"class": "Alert Alert--informative Alert--center mt-800 mb-600"}): + break + + data_total.extend(data_temp) + page_num += 1 + + except Exception as e: + logger.error(f"An error occurred while scraping data: {e}") + + return data_total + +def scrape_data(city, title, user): + """ + Splits data into variables and formats it. 
+ """ + logger.info("Scrape called") + clean_data(user) + + try: + url = f"https://www.jobs.cz/prace/{city}/?q%5B%5D={title}&locality[radius]=0&page=" + data = scrape_jobsCZ(url) + today = date.today() + + for item in data: + # Extract and clean salary data + salary_data = item.find("span", {"class": "Tag Tag--success Tag--small Tag--subtle"}) + salary_data = salary_data.string if salary_data else "N/A" + + # Extract and format published date + published = item.find("div", {"class": "SearchResultCard__status SearchResultCard__status--default"}) + if published: + published = published.string + if "včera" in published: + published = (today - timedelta(days=1)).strftime("%d.%m.") + elif "Přidáno" in published: + published = today.strftime("%d.%m.") + elif "Aktualizováno" in published: + published = "Aktualizováno " + today.strftime("%d.%m.") + else: + published = parse_czech_date(published) + else: + published = item.find("div", {"class": "SearchResultCard__status SearchResultCard__status--danger"}) + published = published.string if published else "Unknown" + + # Extract title, company, and link + title = item.find("a", {"class": "link-primary SearchResultCard__titleLink"}).string + company = item.find("li", {"class": "SearchResultCard__footerItem"}).find("span").string + link = item.find("a", {"class": "link-primary SearchResultCard__titleLink"})["href"] + unique_id = f"{title.lower().replace(' ', '')}_{company.lower().replace(' ', '')}" + + # Save job to the database + job = Job( + title=title, + company=company, + link=link, + salary=salary_data, + published_date=published, + unique_id=unique_id, + user=user + ) + job.save() + + except Exception as e: + logger.error(f"An error occurred while processing data: {e}") + +def clean_data(user): + """ + Cleans up existing job data for the user. 
+ """ + try: + Job.objects.filter(user=user).delete() + except Exception as e: + logger.info("No data to delete or an error occurred while deleting data.") + logger.error(e) diff --git a/jobs_dashboard/migrations/0001_initial.py b/jobs_dashboard/migrations/0001_initial.py new file mode 100644 index 0000000..4793f23 --- /dev/null +++ b/jobs_dashboard/migrations/0001_initial.py @@ -0,0 +1,35 @@ +# Generated by Django 4.2.1 on 2024-08-06 16:53 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='Job', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('title', models.CharField(default='N/A', max_length=255)), + ('company', models.CharField(default='N/A', max_length=255)), + ('link', models.CharField(max_length=255, null=True)), + ('salary', models.CharField(max_length=255, null=True)), + ('published_date', models.CharField(max_length=255, null=True)), + ('unique_id', models.CharField(max_length=255, null=True)), + ('user', models.CharField(max_length=255, null=True)), + ], + ), + migrations.CreateModel( + name='JudgeDetail', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('username', models.CharField(max_length=50)), + ('password', models.CharField(max_length=50)), + ], + ), + ] diff --git a/jobs_dashboard/migrations/__init__.py b/jobs_dashboard/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/jobs_dashboard/models.py b/jobs_dashboard/models.py new file mode 100644 index 0000000..7b15396 --- /dev/null +++ b/jobs_dashboard/models.py @@ -0,0 +1,17 @@ +from django.db import models + +# Create your models here. 
+
+class JudgeDetail(models.Model):
+    username = models.CharField(max_length=50)
+    password = models.CharField(max_length=50)
+
+class Job(models.Model):
+    title = models.CharField(max_length=255, default='N/A')
+    company = models.CharField(max_length=255, default='N/A')
+    link = models.CharField(max_length=255, null=True)
+    salary = models.CharField(max_length=255, null=True)
+    published_date = models.CharField(max_length=255, null=True)
+    unique_id = models.CharField(max_length=255, null=True)
+    user = models.CharField(max_length=255, null=True)
+
diff --git a/jobs_dashboard/support.py b/jobs_dashboard/support.py
new file mode 100644
index 0000000..adfe5b7
--- /dev/null
+++ b/jobs_dashboard/support.py
@@ -0,0 +1,21 @@
+def parse_czech_date(input):
+    """Convert a Czech date such as "12. srpna" to a "day.month." string.
+
+    Line breaks and spaces are removed from ``str(input)`` first. The day is
+    whatever precedes the first "." and the month is the 1-based position of
+    the first Czech genitive month name found in the text (ledna -> 1,
+    prosince -> 12), so "12. srpna" becomes "12.8.".
+
+    Returns "Date not Found" when no month name is present.
+    """
+    months_cs = ["ledna","února","března","dubna","května","června","července","srpna","září","října","listopadu","prosince"]
+    input = str(input)
+    input = input.replace("\n","")
+    input = input.replace(" ","")
+
+    # Calendar months are 1-based; enumerate() starts at 0 by default, which
+    # shifted every month down by one (ledna -> "0", prosince -> "11").
+    for month_number, month in enumerate(months_cs, start=1):
+        if month in input:
+            return input.split(".")[0] + "." + str(month_number) + "."
+    return "Date not Found"
\ No newline at end of file
diff --git a/jobs_dashboard/templates/home.html b/jobs_dashboard/templates/home.html
new file mode 100644
index 0000000..d5561bf
--- /dev/null
+++ b/jobs_dashboard/templates/home.html
@@ -0,0 +1,48 @@
+
+
+{% load static %}
+
+ + + + + +This is an educational project created by
+ Jiri Karlik.
Title | +Salary | +Company | +Published | +URL | +
---|---|---|---|---|
{{ job.title }} | +{{ job.salary }} | +{{ job.company }} | +{{ job.published_date }} | +URL | +