Please load the libraries:
library(tidyverse)
library(knitr)
library(lubridate)
library(ggthemes)
library(scico)
library(ggmap)
library(data.table)
library(sf)
library(tmap)
Check the working directory:
# Print the current working directory.
getwd()
We can read and hear more and more about big data and open data. Big
data we have already covered (in reality both datasets are very big; here we
only looked at small fragments).
What about open data? In Estonia, the state-produced
linked open data is accessible through the Open Data Portal of Estonia. One
of the important features of this data is that it is often machine
readable.
Let's look at one of the datasets from the Estonian Police and Border Guard Board. It contains information about crimes in Estonia. The dataset is updated weekly and is freely available.
Download it:
# Read the tab-separated crime table straight from the open data portal.
# The date and time columns are kept as text here; the date is parsed later.
crime_data <- read_delim(
  file = "https://opendata.smit.ee/ppa/csv/avalik_1.csv",
  delim = "\t",
  escape_double = FALSE,
  col_types = cols(
    ToimKell = col_character(),
    ToimKpv = col_character()
  ),
  trim_ws = TRUE
)
glimpse(crime_data)
## Rows: 9,893
## Columns: 18
## $ JuhtumId <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24~
## $ ToimKpv <chr> "2022-11-15", "2022-11-15", "2022-11-15", "202~
## $ ToimKell <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "~
## $ ToimNadalapaev <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisip~
## $ SyndmusLiik <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVAR~
## $ SyndmusTaiendavStatLiik <chr> NA, "MOBIILTELEFONIVARGUS,TASKUVARGUS", NA, NA~
## $ Seadus <chr> "Karistusseadustik", "Karistusseadustik", "Kar~
## $ Paragrahv <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199~
## $ ParagrahvTais <chr> "§ 218. Varavastane süütegu väheväärtusliku as~
## $ Loige <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2~
## $ Kahjusumma <chr> "0-499", NA, "0-499", "0-499", "0-499", NA, "0~
## $ KohtLiik <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", ~
## $ MaakondNimetus <chr> "Harju maakond", "Harju maakond", "Harju maako~
## $ ValdLinnNimetus <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn"~
## $ KohtNimetus <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe~
## $ Lest_X <chr> "6593000-6593999", "6589000-6589499", "6587000~
## $ Lest_Y <chr> "557000-557999", "542500-542999", "545000-5454~
## $ SyyteoLiik <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT"~
# Sometimes there is an easier way: in this case fread() reads the file
# with its defaults, no extra parsing arguments are needed.
crime_data_2 <- fread(
  input = "https://opendata.smit.ee/ppa/csv/avalik_1.csv",
  encoding = "UTF-8"
)
glimpse(crime_data_2)
## Rows: 9,893
## Columns: 18
## $ JuhtumId <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24~
## $ ToimKpv <date> 2022-11-15, 2022-11-15, 2022-11-15, 2022-11-1~
## $ ToimKell <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "~
## $ ToimNadalapaev <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisip~
## $ SyndmusLiik <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVAR~
## $ SyndmusTaiendavStatLiik <chr> "", "MOBIILTELEFONIVARGUS,TASKUVARGUS", "", ""~
## $ Seadus <chr> "Karistusseadustik", "Karistusseadustik", "Kar~
## $ Paragrahv <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199~
## $ ParagrahvTais <chr> "§ 218. Varavastane süütegu väheväärtusliku as~
## $ Loige <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2~
## $ Kahjusumma <chr> "0-499", "", "0-499", "0-499", "0-499", "", "0~
## $ KohtLiik <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", ~
## $ MaakondNimetus <chr> "Harju maakond", "Harju maakond", "Harju maako~
## $ ValdLinnNimetus <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn"~
## $ KohtNimetus <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe~
## $ Lest_X <chr> "6593000-6593999", "6589000-6589499", "6587000~
## $ Lest_Y <chr> "557000-557999", "542500-542999", "545000-5454~
## $ SyyteoLiik <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT"~
# Show the first rows of the table as a formatted table.
kable(head(crime_data))
| JuhtumId | ToimKpv | ToimKell | ToimNadalapaev | SyndmusLiik | SyndmusTaiendavStatLiik | Seadus | Paragrahv | ParagrahvTais | Loige | Kahjusumma | KohtLiik | MaakondNimetus | ValdLinnNimetus | KohtNimetus | Lest_X | Lest_Y | SyyteoLiik |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5fe24546-c87c-18da-848a-e723ecc8c977 | 2022-11-15 | 21:28 | Teisipäev | PISIVARGUS | NA | Karistusseadustik | § 218. | § 218. Varavastane süütegu väheväärtusliku asja ja varalise õiguse vastu | lg. 1. | 0-499 | AVALIK_KOHT,KAUPLUS | Harju maakond | Maardu linn | Maardu linn | 6593000-6593999 | 557000-557999 | VT |
| 5fe24550-c87c-18da-848a-e723ecc8c977 | 2022-11-15 | 21:00 | Teisipäev | VARGUS | MOBIILTELEFONIVARGUS,TASKUVARGUS | Karistusseadustik | § 199. | § 199. Vargus | lg. 1. | NA | AVALIK_KOHT,KASIINO | Harju maakond | Tallinn | Kesklinna linnaosa | 6589000-6589499 | 542500-542999 | KT |
| 5fe2451e-c87c-18da-848a-e723ecc8c977 | 2022-11-15 | 15:05 | Teisipäev | PISIVARGUS | NA | Karistusseadustik | § 218. | § 218. Varavastane süütegu väheväärtusliku asja ja varalise õiguse vastu | lg. 1. | 0-499 | AVALIK_KOHT,KAUPLUS | Harju maakond | Tallinn | Lasnamäe linnaosa | 6587000-6587499 | 545000-545499 | VT |
| 5fe24564-c87c-18da-848a-e723ecc8c977 | 2022-11-15 | 14:25 | Teisipäev | PISIVARGUS | NA | Karistusseadustik | § 218. | § 218. Varavastane süütegu väheväärtusliku asja ja varalise õiguse vastu | lg. 1. | 0-499 | AVALIK_KOHT,KAUPLUS | Harju maakond | Tallinn | Lasnamäe linnaosa | 6587000-6587499 | 545000-545499 | VT |
| 5fe244e2-c87c-18da-848a-e723ecc8c977 | 2022-11-15 | 12:07 | Teisipäev | VARGUS | MUU_VARGUS | Karistusseadustik | § 199. | § 199. Vargus | lg. 2. | 0-499 | AVALIK_KOHT,KAUPLUS | Harju maakond | Tallinn | Haabersti linnaosa | 6587500-6587999 | 537000-537499 | KT |
| 5fe244c4-c87c-18da-848a-e723ecc8c977 | 2022-11-15 | 10:54 | Teisipäev | PISIVARGUS | NA | Karistusseadustik | § 218. | § 218. Varavastane süütegu väheväärtusliku asja ja varalise õiguse vastu | lg. 1. | NA | AVALIK_KOHT,KAUPLUS | Harju maakond | Tallinn | Nõmme linnaosa | 6582000-6582499 | 540000-540499 | VT |
Unfortunately the table is in Estonian, but hopefully we manage to analyse it! First we translate the column names to English:
# Original (Estonian) column names, kept for the translation table.
crime_names <- names(crime_data)
# English column names, listed in the same order as the original columns.
crime_names_en <- c(
  "CaseId", "Date", "Time", "Weekday",
  "CaseType", "CaseTypeAdditional",
  "Law", "Paragraph", "ParagraphFull", "Section",
  "DamagesEuro", "PlaceType",
  "County", "Municipality", "Place",
  "Lest_X", "Lest_Y",
  "Type"
)
# Side-by-side table of the Estonian and English column names.
# tibble() replaces data_frame(), which is deprecated in the tibble package.
tibble("Columns, et" = crime_names, "Columns, en" = crime_names_en) %>%
  kable()
| Columns, et | Columns, en |
|---|---|
| JuhtumId | CaseId |
| ToimKpv | Date |
| ToimKell | Time |
| ToimNadalapaev | Weekday |
| SyndmusLiik | CaseType |
| SyndmusTaiendavStatLiik | CaseTypeAdditional |
| Seadus | Law |
| Paragrahv | Paragraph |
| ParagrahvTais | ParagraphFull |
| Loige | Section |
| Kahjusumma | DamagesEuro |
| KohtLiik | PlaceType |
| MaakondNimetus | County |
| ValdLinnNimetus | Municipality |
| KohtNimetus | Place |
| Lest_X | Lest_X |
| Lest_Y | Lest_Y |
| SyyteoLiik | Type |
Replace column names and select relevant (more interesting variables):
# Swap the Estonian column names for the English ones (same order).
names(crime_data) <- crime_names_en
glimpse(crime_data)
## Rows: 9,893
## Columns: 18
## $ CaseId <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24550-c~
## $ Date <chr> "2022-11-15", "2022-11-15", "2022-11-15", "2022-11-~
## $ Time <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "10:54~
## $ Weekday <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisipäev",~
## $ CaseType <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVARGUS",~
## $ CaseTypeAdditional <chr> NA, "MOBIILTELEFONIVARGUS,TASKUVARGUS", NA, NA, "MU~
## $ Law <chr> "Karistusseadustik", "Karistusseadustik", "Karistus~
## $ Paragraph <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199.", "~
## $ ParagraphFull <chr> "§ 218. Varavastane süütegu väheväärtusliku asja ja~
## $ Section <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2.", "~
## $ DamagesEuro <chr> "0-499", NA, "0-499", "0-499", "0-499", NA, "0-499"~
## $ PlaceType <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", "AVAL~
## $ County <chr> "Harju maakond", "Harju maakond", "Harju maakond", ~
## $ Municipality <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn", "Ta~
## $ Place <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe linn~
## $ Lest_X <chr> "6593000-6593999", "6589000-6589499", "6587000-6587~
## $ Lest_Y <chr> "557000-557999", "542500-542999", "545000-545499", ~
## $ Type <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT", "VT~
To check which categories a variable contains, we can use the command
distinct():
# Unique values of the case type column.
LIST_caseType <- distinct(crime_data, CaseType)
# Show the first few of them as a formatted table.
kable(head(LIST_caseType))
| CaseType |
|---|
| PISIVARGUS |
| VARGUS |
| PISIVARGUS,VARGUS |
| VANDALISM |
| LITTER,PISIVARGUS |
| AVALIKU_KORRA_RIKKUMINE |
Next we should check the frequency of different categories:
# Number of cases per case type, most frequent first.
TOP_caseType <- count(crime_data, CaseType, sort = TRUE)
head(TOP_caseType)
## # A tibble: 6 x 2
## CaseType n
## <chr> <int>
## 1 VARGUS 4359
## 2 PISIVARGUS 2560
## 3 AVALIKU_KORRA_RIKKUMINE 750
## 4 VANDALISM 636
## 5 JALGRATTA_MOPEEDI_VARGUS 349
## 6 MUU 179
As we see once again, the working language of the Estonian police is Estonian. But we can translate the top 5 crime types easily to English:
| crime type, et | link to google translate |
|---|---|
| VARGUS | https://translate.google.com/#et/en/vargus |
| PISIVARGUS | https://translate.google.com/#et/en/pisivargus |
| AVALIKU_KORRA_RIKKUMINE | https://translate.google.com/#et/en/avaliku_korra_rikkumine |
| VANDALISM | https://translate.google.com/#et/en/vandalism |
| JALGRATTA_MOPEEDI_VARGUS | https://translate.google.com/#et/en/jalgratta_mopeedi_vargus |
Traditional glimpse at the data:
# Print a compact overview of the columns: name, type and first values.
glimpse(crime_data)
## Rows: 9,893
## Columns: 18
## $ CaseId <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24550-c~
## $ Date <chr> "2022-11-15", "2022-11-15", "2022-11-15", "2022-11-~
## $ Time <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "10:54~
## $ Weekday <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisipäev",~
## $ CaseType <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVARGUS",~
## $ CaseTypeAdditional <chr> NA, "MOBIILTELEFONIVARGUS,TASKUVARGUS", NA, NA, "MU~
## $ Law <chr> "Karistusseadustik", "Karistusseadustik", "Karistus~
## $ Paragraph <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199.", "~
## $ ParagraphFull <chr> "§ 218. Varavastane süütegu väheväärtusliku asja ja~
## $ Section <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2.", "~
## $ DamagesEuro <chr> "0-499", NA, "0-499", "0-499", "0-499", NA, "0-499"~
## $ PlaceType <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", "AVAL~
## $ County <chr> "Harju maakond", "Harju maakond", "Harju maakond", ~
## $ Municipality <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn", "Ta~
## $ Place <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe linn~
## $ Lest_X <chr> "6593000-6593999", "6589000-6589499", "6587000-6587~
## $ Lest_Y <chr> "557000-557999", "542500-542999", "545000-545499", ~
## $ Type <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT", "VT~
As it appears, all the variables were imported as character. We have to convert the columns to the proper format. First we convert the date, then calculate the weekday and the time in hours:
# R reports weekday names in the language defined in your computer settings.
# Switch the time locale to English so that the weekday labels are in English.
# NOTE(review): the locale name "English" looks Windows-style (the recorded
# output is "English_United States.1252") — confirm the name on other systems.
Sys.setlocale("LC_TIME", "English")
## [1] "English_United States.1252"
# Parse the date text into a Date, then derive the abbreviated weekday label
# and the hour of day (the first two characters of the "HH:MM" time text).
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
crime_data <- crime_data %>%
  mutate(
    Date = ymd(Date),
    wday = lubridate::wday(Date, abbr = TRUE, label = TRUE),
    hour = as.integer(substr(Time, 1, 2))
  )
# Change the time locale back to your own language once the weekday labels
# have been computed:
Sys.setlocale("LC_TIME","Estonian") # in the current case it is Estonian
## [1] "Estonian_Estonia.1257"
glimpse(crime_data)
## Rows: 9,893
## Columns: 20
## $ CaseId <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24550-c~
## $ Date <date> 2022-11-15, 2022-11-15, 2022-11-15, 2022-11-15, 20~
## $ Time <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "10:54~
## $ Weekday <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisipäev",~
## $ CaseType <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVARGUS",~
## $ CaseTypeAdditional <chr> NA, "MOBIILTELEFONIVARGUS,TASKUVARGUS", NA, NA, "MU~
## $ Law <chr> "Karistusseadustik", "Karistusseadustik", "Karistus~
## $ Paragraph <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199.", "~
## $ ParagraphFull <chr> "§ 218. Varavastane süütegu väheväärtusliku asja ja~
## $ Section <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2.", "~
## $ DamagesEuro <chr> "0-499", NA, "0-499", "0-499", "0-499", NA, "0-499"~
## $ PlaceType <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", "AVAL~
## $ County <chr> "Harju maakond", "Harju maakond", "Harju maakond", ~
## $ Municipality <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn", "Ta~
## $ Place <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe linn~
## $ Lest_X <chr> "6593000-6593999", "6589000-6589499", "6587000-6587~
## $ Lest_Y <chr> "557000-557999", "542500-542999", "545000-545499", ~
## $ Type <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT", "VT~
## $ wday <ord> Tue, Tue, Tue, Tue, Tue, Tue, Tue, Mon, Mon, Mon, M~
## $ hour <int> 21, 21, 15, 14, 12, 10, 1, 23, 19, 17, 16, 13, 12, ~
Crimes dataset also contains spatial information (L-EST97; epsg = 3301). Currently the coordinates are stored in character field and given as range (coordinates are gridded). Our task is to convert the pixel/cell extent to centroid:
# Turn the gridded coordinate ranges ("min-max" text) into numeric cell
# centroids. separate() drops its input column, so a backup copy of each
# range column is stored first.
crime_data <- crime_data %>%
  mutate(Lest_X_bu = Lest_X, Lest_Y_bu = Lest_Y) %>%
  separate(Lest_X, into = c("x_min", "x_max"), sep = "-") %>%
  separate(Lest_Y, into = c("y_min", "y_max"), sep = "-") %>%
  mutate(
    across(c(x_min, x_max, y_min, y_max), as.integer),
    # centroid = midpoint of the cell extent
    x = (x_min + x_max) / 2,
    y = (y_min + y_max) / 2
  )
glimpse(crime_data)
## Rows: 9,893
## Columns: 26
## $ CaseId <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24550-c~
## $ Date <date> 2022-11-15, 2022-11-15, 2022-11-15, 2022-11-15, 20~
## $ Time <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "10:54~
## $ Weekday <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisipäev",~
## $ CaseType <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVARGUS",~
## $ CaseTypeAdditional <chr> NA, "MOBIILTELEFONIVARGUS,TASKUVARGUS", NA, NA, "MU~
## $ Law <chr> "Karistusseadustik", "Karistusseadustik", "Karistus~
## $ Paragraph <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199.", "~
## $ ParagraphFull <chr> "§ 218. Varavastane süütegu väheväärtusliku asja ja~
## $ Section <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2.", "~
## $ DamagesEuro <chr> "0-499", NA, "0-499", "0-499", "0-499", NA, "0-499"~
## $ PlaceType <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", "AVAL~
## $ County <chr> "Harju maakond", "Harju maakond", "Harju maakond", ~
## $ Municipality <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn", "Ta~
## $ Place <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe linn~
## $ x_min <int> 6593000, 6589000, 6587000, 6587000, 6587500, 658200~
## $ x_max <int> 6593999, 6589499, 6587499, 6587499, 6587999, 658249~
## $ y_min <int> 557000, 542500, 545000, 545000, 537000, 540000, 544~
## $ y_max <int> 557999, 542999, 545499, 545499, 537499, 540499, 544~
## $ Type <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT", "VT~
## $ wday <ord> Tue, Tue, Tue, Tue, Tue, Tue, Tue, Mon, Mon, Mon, M~
## $ hour <int> 21, 21, 15, 14, 12, 10, 1, 23, 19, 17, 16, 13, 12, ~
## $ Lest_X_bu <chr> "6593000-6593999", "6589000-6589499", "6587000-6587~
## $ Lest_Y_bu <chr> "557000-557999", "542500-542999", "545000-545499", ~
## $ x <dbl> 6593500, 6589250, 6587250, 6587250, 6587750, 658225~
## $ y <dbl> 557499.5, 542749.5, 545249.5, 545249.5, 537249.5, 5~
Plot the distribution of crimes in Estonia:
# Scatter plot of the cell centroids, drawn semi-transparent.
ggplot(crime_data, aes(x = x, y = y)) +
  geom_point(size = 2, alpha = 0.2)
## Warning: Removed 90 rows containing missing values (geom_point).
Well… It should resemble the contour of Estonia. But it does not!
Let’s download Estonian contour from Estonian Land Board:
# Download the county layer from the Estonian Land Board and extract it.
download.file(
  "https://geoportaal.maaamet.ee/docs/haldus_asustus/maakond_shp.zip",
  destfile = "maakond_shp.zip"
)
# maakond means county!
unzip("maakond_shp.zip")
# Find the name of the extracted shp layer. `pattern` is a regular expression:
# the dot is escaped and the match anchored at the end, so only files ending
# in ".shp" are listed (an unescaped ".shp" also matches "maakond_shp.zip"
# and "*.shp.xml").
list.files(pattern = "\\.shp$")
## [1] "asustusyksus_20211001.shp" "asustusyksus_20211101.shp"
## [3] "asustusyksus_20221101.shp" "eestimaa_wgs84.shp"
## [5] "gps_us.shp" "gps_us_monterey.shp"
## [7] "maakond_20210901.shp" "maakond_20211101.shp"
## [9] "maakond_20221001.shp" "maakond_20221101.shp"
## [11] "omavalitsus_20211001.shp" "omavalitsus_20221101.shp"
## [13] "population_2017.shp" "trt_cont.shp"
# Read the county shapefile into R.
county <- st_read(dsn = "maakond_20211101.shp")
## Reading layer `maakond_20211101' from data source
## `C:\ANTO\loengud\geopythonR\rspatial_Git\rspatial\maakond_20211101.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 15 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 369032.1 ymin: 6377141 xmax: 739152.8 ymax: 6634019
## Projected CRS: Estonian Coordinate System of 1997
# Draw the county polygons.
ggplot(data = county) +
  geom_sf()
# Plotting was a bit slow, so check how much memory the object takes:
object.size(county)
## 17276056 bytes
# Reduce the layer size by simplifying the geometries (see ?st_simplify).
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
county <- st_simplify(county, preserveTopology = TRUE, dTolerance = 200)
object.size(county) # Smaller = faster!
## 1076304 bytes
Plot crimes data on top of county borders:
# County borders first, crime centroids drawn over them in red.
ggplot() +
  geom_sf(data = county) +
  geom_point(
    data = crime_data,
    mapping = aes(x = x, y = y),
    size = 2,
    alpha = 0.2,
    colour = "red"
  )
## Warning: Removed 90 rows containing missing values (geom_point).
Well, looks like coordinate field names are switched in case of crimes dataset. Rename the coordinates columns to avoid mess:
# Swap the names of the two coordinate columns in a single rename() call.
crime_data <- crime_data %>%
  rename(x = y, y = x)
# Plot again: light grey counties with small red squares for the crimes.
ggplot() +
  geom_sf(
    data = county,
    colour = "grey",
    fill = "grey90",
    size = 0.25
  ) +
  geom_point(
    data = crime_data,
    mapping = aes(x = x, y = y),
    size = 0.5,
    alpha = 0.25,
    shape = 15,
    colour = "red"
  )
## Warning: Removed 90 rows containing missing values (geom_point).
It worked! Spatial pattern of crimes correlates nicely with population density. We can reduce the dataset by aggregating it to grid:
# Check the columns again before aggregating.
glimpse(crime_data)
## Rows: 9,893
## Columns: 26
## $ CaseId <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24550-c~
## $ Date <date> 2022-11-15, 2022-11-15, 2022-11-15, 2022-11-15, 20~
## $ Time <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "10:54~
## $ Weekday <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisipäev",~
## $ CaseType <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVARGUS",~
## $ CaseTypeAdditional <chr> NA, "MOBIILTELEFONIVARGUS,TASKUVARGUS", NA, NA, "MU~
## $ Law <chr> "Karistusseadustik", "Karistusseadustik", "Karistus~
## $ Paragraph <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199.", "~
## $ ParagraphFull <chr> "§ 218. Varavastane süütegu väheväärtusliku asja ja~
## $ Section <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2.", "~
## $ DamagesEuro <chr> "0-499", NA, "0-499", "0-499", "0-499", NA, "0-499"~
## $ PlaceType <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", "AVAL~
## $ County <chr> "Harju maakond", "Harju maakond", "Harju maakond", ~
## $ Municipality <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn", "Ta~
## $ Place <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe linn~
## $ x_min <int> 6593000, 6589000, 6587000, 6587000, 6587500, 658200~
## $ x_max <int> 6593999, 6589499, 6587499, 6587499, 6587999, 658249~
## $ y_min <int> 557000, 542500, 545000, 545000, 537000, 540000, 544~
## $ y_max <int> 557999, 542999, 545499, 545499, 537499, 540499, 544~
## $ Type <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT", "VT~
## $ wday <ord> Tue, Tue, Tue, Tue, Tue, Tue, Tue, Mon, Mon, Mon, M~
## $ hour <int> 21, 21, 15, 14, 12, 10, 1, 23, 19, 17, 16, 13, 12, ~
## $ Lest_X_bu <chr> "6593000-6593999", "6589000-6589499", "6587000-6587~
## $ Lest_Y_bu <chr> "557000-557999", "542500-542999", "545000-545499", ~
## $ y <dbl> 6593500, 6589250, 6587250, 6587250, 6587750, 658225~
## $ x <dbl> 557499.5, 542749.5, 545249.5, 545249.5, 537249.5, 5~
# One row per grid cell (unique x/y pair) with the number of crimes in it.
crime_data_grd_aggr <- ungroup(
  summarise(group_by(crime_data, x, y), n = n())
)
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
# Aggregated cells over the counties; colour and transparency follow the count.
ggplot() +
  geom_sf(
    data = county,
    colour = "grey",
    fill = "grey90",
    size = 0.25
  ) +
  geom_point(
    data = crime_data_grd_aggr,
    mapping = aes(x = x, y = y, colour = n, alpha = n)
  ) +
  scale_colour_gradientn(colours = c("black", "red", "orange", "yellow"))
Interactive map could give a better overview? Try!
Convert data frame to sf-object:
# Build point geometries from the x/y columns, using EPSG:3301 as the CRS.
crime_data_grd_aggr_sf <- crime_data_grd_aggr %>%
  st_as_sf(coords = c("x", "y"), crs = 3301)
Not working? Why? Read the error message and solve it!
# Switch tmap to interactive viewing mode.
tmap_mode("view")
## tmap mode set to interactive viewing
# Interactive map: faint county polygons with the aggregated cells as dots
# coloured by the number of crimes.
tm_shape(county) +
  tm_polygons(
    col = "firebrick",
    border.col = "grey30",
    lwd = 0.2,
    alpha = 0.1
  ) +
  tm_shape(crime_data_grd_aggr_sf) +
  tm_dots(
    "n",
    palette = c("pink", "red", "black"),
    border.lwd = 0.1,
    style = "log10_pretty"
  )
The general picture remained the same but the dataset is 10x smaller
(9893 rows vs 972 rows). The only problem with the previous
plot is that the cells with large values (crime hotspots) are buried
under other points.
Can we change the plotting order (bigger values on top)? Yes! We can
sort rows by value:
# Sort ascending by count so that the cells with the most crimes are drawn
# last, i.e. on top of the others.
crime_data_grd_aggr <- arrange(crime_data_grd_aggr, n)
ggplot() +
  geom_sf(
    data = county,
    colour = "grey",
    fill = "grey90",
    size = 0.25
  ) +
  geom_point(
    data = crime_data_grd_aggr,
    mapping = aes(x = x, y = y, colour = n)
  ) +
  scale_colour_gradientn(colours = c("black", "red", "orange", "yellow"))
Group data by weekdays and hours, count number of crimes by groups. Create a plot:
# Number of crimes for every weekday/hour combination.
crime_data_aggr_hourWday <- ungroup(
  summarise(group_by(crime_data, wday, hour), n = n())
)
## `summarise()` has grouped output by 'wday'. You can override using the
## `.groups` argument.
# Hourly crime counts, one panel per weekday.
ggplot(crime_data_aggr_hourWday, aes(x = hour, y = n)) +
  geom_line() +
  facet_wrap(~wday) +
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  theme_minimal()
The result looks almost okay! By default R assumes that the week starts on Sunday, so Sunday is shown as the first day of the week. But we can change that! Let’s define Monday as the first day of the week:
# List the weekday labels present in the data.
distinct(crime_data, wday)
## # A tibble: 7 x 1
## wday
## <ord>
## 1 Tue
## 2 Mon
## 3 Sun
## 4 Sat
## 5 Fri
## 6 Thu
## 7 Wed
# Re-level the weekday factor so that the week starts on Monday.
crime_data <- crime_data %>%
  mutate(
    wday = factor(wday, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))
  )
Let’s focus on public order violation (avaliku korra rikkumine):
# Keep only the public order violations.
crime_data_publicOrder <- crime_data %>%
  filter(CaseType == "AVALIKU_KORRA_RIKKUMINE")
# Count cases per weekday and hour. The filtered subset is aggregated here;
# aggregating the full crime_data would leave the filter above without effect.
crime_data_aggr_hourWday <- crime_data_publicOrder %>%
  group_by(wday, hour) %>%
  summarise(n = n(), .groups = "drop")
# Hourly counts per weekday as lines, one panel per weekday.
ggplot(crime_data_aggr_hourWday, aes(x = hour, y = n)) +
  geom_line() +
  facet_wrap(~wday) +
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  theme_minimal()
Line graph is not the best type, because the weekdays are not connected with each other. Make a bar plot:
# The same counts as red bars, one panel per weekday.
ggplot(crime_data_aggr_hourWday, aes(x = hour, y = n)) +
  geom_col(fill = "red") +
  facet_wrap(~wday) +
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  theme_minimal()
Many other possibilities:
# Vertical segments from zero up to the count, with a point at the top.
# The aesthetics stay in the layers; only the shared data moves to ggplot().
ggplot(data = crime_data_aggr_hourWday) +
  geom_segment(
    mapping = aes(x = hour, xend = hour, y = 0, yend = n),
    colour = "red"
  ) +
  geom_point(
    mapping = aes(x = hour, y = n),
    colour = "red"
  ) +
  facet_wrap(~wday) +
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  theme_minimal()
Hard to compare? Put all days on the same plot:
# All weekdays in one panel, one coloured line per weekday.
ggplot(
  crime_data_aggr_hourWday,
  aes(x = hour, y = n, group = wday, colour = wday)
) +
  geom_line() +
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  scale_colour_manual(
    values = c("dodgerblue", "green", "blue", "grey", "orange", "red", "magenta")
  ) +
  theme_minimal()
Still not very beautiful. We can calculate the mean for workdays and for weekend:
# Label each weekday as weekend or workday, then average the hourly counts
# within each of the two groups.
crime_data_aggr_hourWeekend <- crime_data_aggr_hourWday %>%
  mutate(
    weekend = ifelse(wday == "Sat" | wday == "Sun", "weekend", "workday")
  ) %>%
  group_by(weekend, hour) %>%
  summarise(n = mean(n)) %>%
  ungroup()
# Mean hourly counts: weekend against workdays, as lines with open points.
ggplot(crime_data_aggr_hourWeekend, aes(x = hour, y = n, colour = weekend)) +
  geom_line() +
  geom_point(shape = 21) +
  labs(
    title = "Average dynamics of crimes during the day",
    caption = "author: A. Aasa"
  ) +
  scale_colour_manual(values = c("orange", "dodgerblue")) +
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  theme_minimal()
We don’t have to stay on a national level, we can also “zoom in”. For example we can download base map for Tartu and put crimes on that map.
# Contour of Tartu: download the archive, extract it and read the shapefile.
download.file(
  url = "http://aasa.ut.ee/Rspatial/data/trt_cont.zip",
  destfile = "trt_cont.zip"
)
unzip(zipfile = "trt_cont.zip")
trt_cont <- st_read(dsn = "trt_cont.shp")
## Reading layer `trt_cont' from data source
## `C:\ANTO\loengud\geopythonR\rspatial_Git\rspatial\trt_cont.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 4 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 655873.6 ymin: 6469758 xmax: 663717.9 ymax: 6477657
## Projected CRS: Estonian Coordinate System of 1997
# Declare the CRS of the Tartu contour as EPSG:3301.
trt_cont <- st_set_crs(trt_cont, 3301)
# Inspect the sf object of the aggregated crimes.
glimpse(crime_data_grd_aggr_sf)
## Rows: 971
## Columns: 2
## $ n <int> 1, 1, 1, 1, 2, 1, 1, 1, 5, 5, 1, 17, 2, 6, 5, 1, 6, 1, 1, 1, ~
## $ geometry <POINT [m]> POINT (389499.5 6533500), POINT (396499.5 6448500), POI~
# Clip to Tartu: intersect the aggregated crime points with the city contour.
crime_data_grd_aggr_sf_trt <- crime_data_grd_aggr_sf %>%
  st_intersection(trt_cont)
# Tartu in black with the clipped cells on top; colour and size follow the
# number of crimes.
ggplot() +
  theme_minimal() +
  theme(legend.position = "right") +
  geom_sf(data = trt_cont, fill = "black") +
  geom_sf(
    data = crime_data_grd_aggr_sf_trt,
    mapping = aes(colour = n, size = n),
    shape = 15
  ) +
  scale_colour_gradientn(colours = c("darkgreen", "grey95", "orange"))
Author: Anto Aasa
Supervisors: Anto Aasa & Lika Zhvania
Geospatial analysis in R
LTOM.02.041
Last update: 2022-11-21 15:33:17