{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-title",
   "metadata": {},
   "source": [
    "# Raw country data: sanity checks\n",
    "\n",
    "Loads `countries.json` from `data/scrapy/raw_country_data` and checks selected columns for missing values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8185790-0793-4881-99e8-6730f95a8006",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import pathlib\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Lift the row limit so displayed Series/DataFrames are not truncated.\n",
    "pd_options = {\n",
    "    \"display.max_rows\": None,\n",
    "}\n",
    "\n",
    "# Plain loop: set_option is called for its side effect only, so there is no\n",
    "# reason to build (and echo as cell output) a list of its None results.\n",
    "for option, value in pd_options.items():\n",
    "    pd.set_option(option, value)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-load-data",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36149580-91d9-431d-99c3-51feee829e79",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Two levels above the current working directory, then into the data tree.\n",
    "# NOTE: the result depends on the directory the kernel was started in.\n",
    "data_directory = (\n",
    "    pathlib.Path(\".\").resolve().parents[1] / \"data\" / \"scrapy\" / \"raw_country_data\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d03be94e-8642-4916-8a43-1711e0c21b36",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "countries_file = data_directory / \"countries.json\"\n",
    "# Decode explicitly as UTF-8 instead of the platform locale default, so the\n",
    "# raw parse agrees with pd.read_json (which defaults to UTF-8) on every OS.\n",
    "countries = json.loads(countries_file.read_text(encoding=\"utf-8\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "df = pd.read_json(countries_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-missing-values",
   "metadata": {},
   "source": [
    "## Missing-value checks\n",
    "\n",
    "For each column of interest, print its shape and the distinct values of its missing-value mask. An output of `[False]` means no entry is missing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "helper-report-missing",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def report_missing(series: pd.Series) -> None:\n",
    "    \"\"\"Print the shape of ``series`` and the distinct values of its missing-value mask.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    series : pd.Series\n",
    "        Column to inspect.\n",
    "    \"\"\"\n",
    "    print(series.shape)\n",
    "    # isnull() is an alias of isna(), so a single call is sufficient.\n",
    "    print(series.isna().unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "country_url = df[\"country_url\"]\n",
    "report_missing(country_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48db8f93-659b-45a4-8477-a7cec139bebc",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "short_country_name = df[\"short_country_name\"]\n",
    "report_missing(short_country_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "flag_description = df[\"flag_description\"]\n",
    "report_missing(flag_description)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-single-file",
   "metadata": {},
   "source": [
    "## Entries with a single file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Entries whose \"files\" value has exactly one element.\n",
    "# NOTE(review): presumably the file that is absent is the anthem - confirm\n",
    "# against the scraper that produced countries.json.\n",
    "no_anthem = [item for item in df[\"files\"] if len(item) == 1]\n",
    "print(len(no_anthem))\n",
    "no_anthem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "227b0c76-9e45-4849-849e-36355976cba9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "df[\"short_country_name\"]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "toc-autonumbering": true,
  "toc-showcode": false
 },
 "nbformat": 4,
 "nbformat_minor": 5
}