Files
geography-anki/playground/downloaded_data_inspection_lab/Untitled.ipynb
2022-06-24 21:00:00 +01:00

178 lines
3.7 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "d8185790-0793-4881-99e8-6730f95a8006",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import json\n",
"import pathlib\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea2b3e33-d58e-4e30-a0cc-8218a1f252c9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Notebook-wide pandas display options, applied once below.\n",
"# Per the pandas options docs, display.max_rows=None lifts the row limit.\n",
"pd_options = {\n",
"    \"display.max_rows\": None,\n",
"}\n",
"\n",
"# set_option is called purely for its side effect; a plain loop avoids\n",
"# building a throwaway list that would be echoed as the cell output.\n",
"for option, value in pd_options.items():\n",
"    pd.set_option(option, value)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36149580-91d9-431d-99c3-51feee829e79",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Raw scraped data is located relative to the grandparent of the current\n",
"# working directory (parents[1] of the resolved \".\").\n",
"data_directory = (\n",
"    pathlib.Path(\".\")\n",
"    .resolve()\n",
"    .parents[1]\n",
"    .joinpath(\"data\", \"scrapy\", \"raw_country_data\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d03be94e-8642-4916-8a43-1711e0c21b36",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"countries_file = data_directory / \"countries.json\"\n",
"# Decode explicitly as UTF-8 (the standard JSON encoding) so parsing does not\n",
"# depend on the platform's locale encoding.\n",
"countries = json.loads(countries_file.read_text(encoding=\"utf-8\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29cca9ea-16d3-4534-8c9e-49fde37f8cdd",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Parse countries_file into a DataFrame with pandas' JSON reader.\n",
"df = pd.read_json(path_or_buf=countries_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef8bc3ce-08dd-4260-807c-2616b2e1c1ba",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Size of the country_url column and whether it contains missing values.\n",
"country_url = df[\"country_url\"]\n",
"print(country_url.shape)\n",
"# Series.isnull is an alias of Series.isna, so a single check is enough.\n",
"print(country_url.isna().unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48db8f93-659b-45a4-8477-a7cec139bebc",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Size of the short_country_name column and whether it contains missing values.\n",
"short_country_name = df[\"short_country_name\"]\n",
"print(short_country_name.shape)\n",
"# Series.isnull is an alias of Series.isna, so a single check is enough.\n",
"print(short_country_name.isna().unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a52f6aa2-5bbd-46e4-9b2f-cdbd7269cb6e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Size of the flag_description column and whether it contains missing values.\n",
"flag_description = df[\"flag_description\"]\n",
"print(flag_description.shape)\n",
"# Series.isnull is an alias of Series.isna, so a single check is enough.\n",
"print(flag_description.isna().unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "643e6512-1e5b-4eb2-9f0a-6b680ada787b",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Collect every \"files\" entry that holds exactly one element.\n",
"# NOTE(review): the name suggests these are countries scraped without an\n",
"# anthem file -- confirm against the scraper's output schema.\n",
"no_anthem = [files for files in df[\"files\"] if len(files) == 1]\n",
"print(len(no_anthem))\n",
"no_anthem"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "227b0c76-9e45-4849-849e-36355976cba9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Display the short_country_name column as the cell output.\n",
"df.loc[:, \"short_country_name\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7712d7d-9074-4fc5-89f2-6e5f47c57d20",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
},
"toc-autonumbering": true,
"toc-showcode": false
},
"nbformat": 4,
"nbformat_minor": 5
}