Please load the libraries:

library(tidyverse)
library(knitr)
library(lubridate)
library(ggthemes)
library(scico)
library(ggmap)
library(data.table)
library(sf)
library(tmap)

Check the working directory:

# Print the current working directory; the files downloaded later in this
# script are written to paths relative to it.
getwd()

Crime in Estonia

We can read and hear more and more about big data or open data. Big data we already covered (in reality both datasets are very big; here we just looked at small fragments).
What about open data? In Estonia the state-produced linked open data is accessible through the Open Data Portal of Estonia. One of the important features of this data is that it’s often machine readable.

Let’s look at one of the datasets from the Estonian Police and Border Guard Board. It contains information about crimes in Estonia. The dataset is updated weekly and is freely available.

Download it:

# Read the tab-separated crime register directly from the open data portal.
# The date and time columns are kept as plain text on import.
crime_data <- read_delim(
  file = "https://opendata.smit.ee/ppa/csv/avalik_1.csv",
  delim = "\t",
  col_types = cols(
    ToimKpv = col_character(),
    ToimKell = col_character()
  ),
  escape_double = FALSE,
  trim_ws = TRUE
)
glimpse(crime_data)
## Rows: 9,893
## Columns: 18
## $ JuhtumId                <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24~
## $ ToimKpv                 <chr> "2022-11-15", "2022-11-15", "2022-11-15", "202~
## $ ToimKell                <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "~
## $ ToimNadalapaev          <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisip~
## $ SyndmusLiik             <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVAR~
## $ SyndmusTaiendavStatLiik <chr> NA, "MOBIILTELEFONIVARGUS,TASKUVARGUS", NA, NA~
## $ Seadus                  <chr> "Karistusseadustik", "Karistusseadustik", "Kar~
## $ Paragrahv               <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199~
## $ ParagrahvTais           <chr> "§ 218. Varavastane süütegu väheväärtusliku as~
## $ Loige                   <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2~
## $ Kahjusumma              <chr> "0-499", NA, "0-499", "0-499", "0-499", NA, "0~
## $ KohtLiik                <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", ~
## $ MaakondNimetus          <chr> "Harju maakond", "Harju maakond", "Harju maako~
## $ ValdLinnNimetus         <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn"~
## $ KohtNimetus             <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe~
## $ Lest_X                  <chr> "6593000-6593999", "6589000-6589499", "6587000~
## $ Lest_Y                  <chr> "557000-557999", "542500-542999", "545000-5454~
## $ SyyteoLiik              <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT"~
# Sometimes there is an easier solution: data.table::fread() reads the same
# file without any extra parsing parameters (only the encoding is stated).
crime_data_2 <- fread(
  input = "https://opendata.smit.ee/ppa/csv/avalik_1.csv",
  encoding = "UTF-8"
)
glimpse(crime_data_2)
## Rows: 9,893
## Columns: 18
## $ JuhtumId                <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24~
## $ ToimKpv                 <date> 2022-11-15, 2022-11-15, 2022-11-15, 2022-11-1~
## $ ToimKell                <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "~
## $ ToimNadalapaev          <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisip~
## $ SyndmusLiik             <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVAR~
## $ SyndmusTaiendavStatLiik <chr> "", "MOBIILTELEFONIVARGUS,TASKUVARGUS", "", ""~
## $ Seadus                  <chr> "Karistusseadustik", "Karistusseadustik", "Kar~
## $ Paragrahv               <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199~
## $ ParagrahvTais           <chr> "§ 218. Varavastane süütegu väheväärtusliku as~
## $ Loige                   <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2~
## $ Kahjusumma              <chr> "0-499", "", "0-499", "0-499", "0-499", "", "0~
## $ KohtLiik                <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", ~
## $ MaakondNimetus          <chr> "Harju maakond", "Harju maakond", "Harju maako~
## $ ValdLinnNimetus         <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn"~
## $ KohtNimetus             <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe~
## $ Lest_X                  <chr> "6593000-6593999", "6589000-6589499", "6587000~
## $ Lest_Y                  <chr> "557000-557999", "542500-542999", "545000-5454~
## $ SyyteoLiik              <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT"~
# Show the first six rows as a formatted table.
kable(head(crime_data))
JuhtumId ToimKpv ToimKell ToimNadalapaev SyndmusLiik SyndmusTaiendavStatLiik Seadus Paragrahv ParagrahvTais Loige Kahjusumma KohtLiik MaakondNimetus ValdLinnNimetus KohtNimetus Lest_X Lest_Y SyyteoLiik
5fe24546-c87c-18da-848a-e723ecc8c977 2022-11-15 21:28 Teisipäev PISIVARGUS NA Karistusseadustik § 218. § 218. Varavastane süütegu väheväärtusliku asja ja varalise õiguse vastu lg. 1. 0-499 AVALIK_KOHT,KAUPLUS Harju maakond Maardu linn Maardu linn 6593000-6593999 557000-557999 VT
5fe24550-c87c-18da-848a-e723ecc8c977 2022-11-15 21:00 Teisipäev VARGUS MOBIILTELEFONIVARGUS,TASKUVARGUS Karistusseadustik § 199. § 199. Vargus lg. 1. NA AVALIK_KOHT,KASIINO Harju maakond Tallinn Kesklinna linnaosa 6589000-6589499 542500-542999 KT
5fe2451e-c87c-18da-848a-e723ecc8c977 2022-11-15 15:05 Teisipäev PISIVARGUS NA Karistusseadustik § 218. § 218. Varavastane süütegu väheväärtusliku asja ja varalise õiguse vastu lg. 1. 0-499 AVALIK_KOHT,KAUPLUS Harju maakond Tallinn Lasnamäe linnaosa 6587000-6587499 545000-545499 VT
5fe24564-c87c-18da-848a-e723ecc8c977 2022-11-15 14:25 Teisipäev PISIVARGUS NA Karistusseadustik § 218. § 218. Varavastane süütegu väheväärtusliku asja ja varalise õiguse vastu lg. 1. 0-499 AVALIK_KOHT,KAUPLUS Harju maakond Tallinn Lasnamäe linnaosa 6587000-6587499 545000-545499 VT
5fe244e2-c87c-18da-848a-e723ecc8c977 2022-11-15 12:07 Teisipäev VARGUS MUU_VARGUS Karistusseadustik § 199. § 199. Vargus lg. 2. 0-499 AVALIK_KOHT,KAUPLUS Harju maakond Tallinn Haabersti linnaosa 6587500-6587999 537000-537499 KT
5fe244c4-c87c-18da-848a-e723ecc8c977 2022-11-15 10:54 Teisipäev PISIVARGUS NA Karistusseadustik § 218. § 218. Varavastane süütegu väheväärtusliku asja ja varalise õiguse vastu lg. 1. NA AVALIK_KOHT,KAUPLUS Harju maakond Tallinn Nõmme linnaosa 6582000-6582499 540000-540499 VT

Unfortunately the table is in Estonian, but hopefully we manage to analyse it! First we translate the column names to English:

# Original (Estonian) column names, in the order they appear in the data.
crime_names <- colnames(crime_data)

# English names, listed in exactly the same order as the original columns.
crime_names_en <- c("CaseId", 
                    "Date", 
                    "Time", 
                    "Weekday", 
                    "CaseType", 
                    "CaseTypeAdditional", 
                    "Law", 
                    "Paragraph", 
                    "ParagraphFull", 
                    "Section", 
                    "DamagesEuro", 
                    "PlaceType", 
                    "County", 
                    "Municipality", 
                    "Place", 
                    "Lest_X", 
                    "Lest_Y", 
                    "Type")

# Side-by-side lookup table of the two naming schemes.
# tibble() replaces the deprecated data_frame().
tibble("Columns, et" = crime_names, "Columns, en" = crime_names_en) %>% 
  kable()
Columns, et Columns, en
JuhtumId CaseId
ToimKpv Date
ToimKell Time
ToimNadalapaev Weekday
SyndmusLiik CaseType
SyndmusTaiendavStatLiik CaseTypeAdditional
Seadus Law
Paragrahv Paragraph
ParagrahvTais ParagraphFull
Loige Section
Kahjusumma DamagesEuro
KohtLiik PlaceType
MaakondNimetus County
ValdLinnNimetus Municipality
KohtNimetus Place
Lest_X Lest_X
Lest_Y Lest_Y
SyyteoLiik Type

Replace column names and select relevant (more interesting variables):

# Overwrite the Estonian column names with the English ones. This is done by
# position, so crime_names_en must follow the column order of the data.
names(crime_data) <- crime_names_en
glimpse(crime_data)
## Rows: 9,893
## Columns: 18
## $ CaseId             <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24550-c~
## $ Date               <chr> "2022-11-15", "2022-11-15", "2022-11-15", "2022-11-~
## $ Time               <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "10:54~
## $ Weekday            <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisipäev",~
## $ CaseType           <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVARGUS",~
## $ CaseTypeAdditional <chr> NA, "MOBIILTELEFONIVARGUS,TASKUVARGUS", NA, NA, "MU~
## $ Law                <chr> "Karistusseadustik", "Karistusseadustik", "Karistus~
## $ Paragraph          <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199.", "~
## $ ParagraphFull      <chr> "§ 218. Varavastane süütegu väheväärtusliku asja ja~
## $ Section            <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2.", "~
## $ DamagesEuro        <chr> "0-499", NA, "0-499", "0-499", "0-499", NA, "0-499"~
## $ PlaceType          <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", "AVAL~
## $ County             <chr> "Harju maakond", "Harju maakond", "Harju maakond", ~
## $ Municipality       <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn", "Ta~
## $ Place              <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe linn~
## $ Lest_X             <chr> "6593000-6593999", "6589000-6589499", "6587000-6587~
## $ Lest_Y             <chr> "557000-557999", "542500-542999", "545000-545499", ~
## $ Type               <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT", "VT~

To check what variable categories are available we can use command distinct():

# One row per distinct value of CaseType.
LIST_caseType <- distinct(crime_data, CaseType)

# Print the first six categories as a table.
kable(head(LIST_caseType))
CaseType
PISIVARGUS
VARGUS
PISIVARGUS,VARGUS
VANDALISM
LITTER,PISIVARGUS
AVALIKU_KORRA_RIKKUMINE

Next we should check the frequency of different categories:

# Number of cases per CaseType, most frequent first.
TOP_caseType <- crime_data %>% 
  count(CaseType, sort = TRUE)

head(TOP_caseType)
## # A tibble: 6 x 2
##   CaseType                     n
##   <chr>                    <int>
## 1 VARGUS                    4359
## 2 PISIVARGUS                2560
## 3 AVALIKU_KORRA_RIKKUMINE    750
## 4 VANDALISM                  636
## 5 JALGRATTA_MOPEEDI_VARGUS   349
## 6 MUU                        179

As we see once again, the working language of the Estonian police is Estonian. But we can translate the top 5 crime types easily to English:

crime type, et link to google translate
VARGUS https://translate.google.com/#et/en/vargus
PISIVARGUS https://translate.google.com/#et/en/pisivargus
AVALIKU_KORRA_RIKKUMINE https://translate.google.com/#et/en/avaliku_korra_rikkumine
VANDALISM https://translate.google.com/#et/en/vandalism
JALGRATTA_MOPEEDI_VARGUS https://translate.google.com/#et/en/jalgratta_mopeedi_vargus

General overview of crimes

Traditional glimpse at the data:

# Inspect the column names and the type each column was imported with:
glimpse(crime_data)
## Rows: 9,893
## Columns: 18
## $ CaseId             <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24550-c~
## $ Date               <chr> "2022-11-15", "2022-11-15", "2022-11-15", "2022-11-~
## $ Time               <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "10:54~
## $ Weekday            <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisipäev",~
## $ CaseType           <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVARGUS",~
## $ CaseTypeAdditional <chr> NA, "MOBIILTELEFONIVARGUS,TASKUVARGUS", NA, NA, "MU~
## $ Law                <chr> "Karistusseadustik", "Karistusseadustik", "Karistus~
## $ Paragraph          <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199.", "~
## $ ParagraphFull      <chr> "§ 218. Varavastane süütegu väheväärtusliku asja ja~
## $ Section            <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2.", "~
## $ DamagesEuro        <chr> "0-499", NA, "0-499", "0-499", "0-499", NA, "0-499"~
## $ PlaceType          <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", "AVAL~
## $ County             <chr> "Harju maakond", "Harju maakond", "Harju maakond", ~
## $ Municipality       <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn", "Ta~
## $ Place              <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe linn~
## $ Lest_X             <chr> "6593000-6593999", "6589000-6589499", "6587000-6587~
## $ Lest_Y             <chr> "557000-557999", "542500-542999", "545000-545499", ~
## $ Type               <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT", "VT~

As it appears all the variables are imported as character. We have to convert columns to proper format. First we convert date, calculate weekdays and also time in hours:

# R gives the weekday labels in the language of the LC_TIME locale.
# Remember the locale that is currently active so that exactly this one can be
# restored afterwards, then switch to English.
# NOTE: "English" is a Windows locale name; other systems need e.g. "en_US.UTF-8".
old_lc_time <- Sys.getlocale("LC_TIME")
Sys.setlocale("LC_TIME", "English")
## [1] "English_United States.1252"
# Parse the date, derive the abbreviated weekday label and take the hour from
# the first two characters of the "HH:MM" time string.
crime_data <- crime_data %>% 
  mutate(Date = ymd(Date), 
         wday = lubridate::wday(Date, abbr = TRUE, label = TRUE), 
         hour = as.integer(substr(Time, 1, 2)))

# change back to the locale that was active before (not a hard-coded language):
Sys.setlocale("LC_TIME", old_lc_time)
## [1] "Estonian_Estonia.1257"
# Check the converted Date column and the newly added wday / hour columns:
glimpse(crime_data)
## Rows: 9,893
## Columns: 20
## $ CaseId             <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24550-c~
## $ Date               <date> 2022-11-15, 2022-11-15, 2022-11-15, 2022-11-15, 20~
## $ Time               <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "10:54~
## $ Weekday            <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisipäev",~
## $ CaseType           <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVARGUS",~
## $ CaseTypeAdditional <chr> NA, "MOBIILTELEFONIVARGUS,TASKUVARGUS", NA, NA, "MU~
## $ Law                <chr> "Karistusseadustik", "Karistusseadustik", "Karistus~
## $ Paragraph          <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199.", "~
## $ ParagraphFull      <chr> "§ 218. Varavastane süütegu väheväärtusliku asja ja~
## $ Section            <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2.", "~
## $ DamagesEuro        <chr> "0-499", NA, "0-499", "0-499", "0-499", NA, "0-499"~
## $ PlaceType          <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", "AVAL~
## $ County             <chr> "Harju maakond", "Harju maakond", "Harju maakond", ~
## $ Municipality       <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn", "Ta~
## $ Place              <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe linn~
## $ Lest_X             <chr> "6593000-6593999", "6589000-6589499", "6587000-6587~
## $ Lest_Y             <chr> "557000-557999", "542500-542999", "545000-545499", ~
## $ Type               <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT", "VT~
## $ wday               <ord> Tue, Tue, Tue, Tue, Tue, Tue, Tue, Mon, Mon, Mon, M~
## $ hour               <int> 21, 21, 15, 14, 12, 10, 1, 23, 19, 17, 16, 13, 12, ~

Crimes dataset also contains spatial information (L-EST97; epsg = 3301). Currently the coordinates are stored in character field and given as range (coordinates are gridded). Our task is to convert the pixel/cell extent to centroid:

# Lest_X / Lest_Y hold grid-cell extents as "min-max" strings.
# 1. keep a copy of the raw strings, because separate() drops its input column
# 2. split each extent into its lower and upper bound
# 3. convert the four bounds from text to integer
# 4. take the midpoint of each extent as the cell centroid
crime_data <- crime_data %>% 
  mutate(Lest_X_bu = Lest_X, Lest_Y_bu = Lest_Y) %>% 
  separate(Lest_X, into = c("x_min", "x_max"), sep = "-") %>% 
  separate(Lest_Y, into = c("y_min", "y_max"), sep = "-") %>% 
  mutate(across(c(x_min, x_max, y_min, y_max), as.integer)) %>% 
  mutate(x = (x_min + x_max) / 2, 
         y = (y_min + y_max) / 2)

glimpse(crime_data)
## Rows: 9,893
## Columns: 26
## $ CaseId             <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24550-c~
## $ Date               <date> 2022-11-15, 2022-11-15, 2022-11-15, 2022-11-15, 20~
## $ Time               <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "10:54~
## $ Weekday            <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisipäev",~
## $ CaseType           <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVARGUS",~
## $ CaseTypeAdditional <chr> NA, "MOBIILTELEFONIVARGUS,TASKUVARGUS", NA, NA, "MU~
## $ Law                <chr> "Karistusseadustik", "Karistusseadustik", "Karistus~
## $ Paragraph          <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199.", "~
## $ ParagraphFull      <chr> "§ 218. Varavastane süütegu väheväärtusliku asja ja~
## $ Section            <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2.", "~
## $ DamagesEuro        <chr> "0-499", NA, "0-499", "0-499", "0-499", NA, "0-499"~
## $ PlaceType          <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", "AVAL~
## $ County             <chr> "Harju maakond", "Harju maakond", "Harju maakond", ~
## $ Municipality       <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn", "Ta~
## $ Place              <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe linn~
## $ x_min              <int> 6593000, 6589000, 6587000, 6587000, 6587500, 658200~
## $ x_max              <int> 6593999, 6589499, 6587499, 6587499, 6587999, 658249~
## $ y_min              <int> 557000, 542500, 545000, 545000, 537000, 540000, 544~
## $ y_max              <int> 557999, 542999, 545499, 545499, 537499, 540499, 544~
## $ Type               <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT", "VT~
## $ wday               <ord> Tue, Tue, Tue, Tue, Tue, Tue, Tue, Mon, Mon, Mon, M~
## $ hour               <int> 21, 21, 15, 14, 12, 10, 1, 23, 19, 17, 16, 13, 12, ~
## $ Lest_X_bu          <chr> "6593000-6593999", "6589000-6589499", "6587000-6587~
## $ Lest_Y_bu          <chr> "557000-557999", "542500-542999", "545000-545499", ~
## $ x                  <dbl> 6593500, 6589250, 6587250, 6587250, 6587750, 658225~
## $ y                  <dbl> 557499.5, 542749.5, 545249.5, 545249.5, 537249.5, 5~

Plot the distribution of crimes in Estonia:

# Scatter plot of the cell centroids; semi-transparent points reveal density.
ggplot(crime_data, aes(x = x, y = y)) +
  geom_point(size = 2, alpha = 0.2)
## Warning: Removed 90 rows containing missing values (geom_point).

Well… It should remind us of the contour of Estonia. But it does not!

Let’s download Estonian contour from Estonian Land Board:

# Download the county layer; binary mode keeps the zip archive intact.
download.file("https://geoportaal.maaamet.ee/docs/haldus_asustus/maakond_shp.zip",
              destfile = "maakond_shp.zip", mode = "wb")
#maakond means county!
unzip("maakond_shp.zip")

# correct name for the downloaded shp-layer
# (`pattern` is a regular expression: the dot is escaped and the end anchored,
# so only *.shp files are listed, not *_shp.zip or *.shp.xml):
list.files(pattern = "\\.shp$")
##  [1] "asustusyksus_20211001.shp" "asustusyksus_20211101.shp"
##  [3] "asustusyksus_20221101.shp" "eestimaa_wgs84.shp"       
##  [5] "gps_us.shp"                "gps_us_monterey.shp"      
##  [7] "maakond_20210901.shp"      "maakond_20211101.shp"     
##  [9] "maakond_20221001.shp"      "maakond_20221101.shp"     
## [11] "omavalitsus_20211001.shp"  "omavalitsus_20221101.shp" 
## [13] "population_2017.shp"       "trt_cont.shp"
# read shp to R (use one of the maakond_*.shp names from the listing above):
county <- st_read("maakond_20211101.shp")
## Reading layer `maakond_20211101' from data source 
##   `C:\ANTO\loengud\geopythonR\rspatial_Git\rspatial\maakond_20211101.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 15 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 369032.1 ymin: 6377141 xmax: 739152.8 ymax: 6634019
## Projected CRS: Estonian Coordinate System of 1997
# Draw the county polygons.
ggplot(data = county) +
  geom_sf()

# plotting was a bit slow, check the object size:
object.size(county)
## 17276056 bytes
# reduce the layer size: simplify the polygon outlines with the given
# tolerance while keeping the topology valid (see ?st_simplify)
county <- st_simplify(county, preserveTopology = TRUE, dTolerance = 200)
object.size(county) # Smaller = faster!
## 1076304 bytes

Plot crimes data on top of county borders:

# County polygons as the base layer, crime centroids in red on top.
ggplot(data = county) +
  geom_sf() +
  geom_point(
    data = crime_data,
    mapping = aes(x = x, y = y),
    colour = "red", size = 2, alpha = 0.2
  )
## Warning: Removed 90 rows containing missing values (geom_point).

Well, looks like coordinate field names are switched in case of crimes dataset. Rename the coordinates columns to avoid mess:

# Swap the names of the two coordinate columns in a single rename() call.
crime_data <- crime_data %>% 
  rename(x = y, y = x)

# Plot again: light grey counties with small red squares for the crimes.
ggplot(data = county) +
  geom_sf(colour = "grey", fill = "grey90", size = 0.25) +
  geom_point(
    data = crime_data,
    mapping = aes(x = x, y = y),
    colour = "red", shape = 15, size = 0.5, alpha = 0.25
  )
## Warning: Removed 90 rows containing missing values (geom_point).

It worked! Spatial pattern of crimes correlates nicely with population density. We can reduce the dataset by aggregating it to grid:

# Look at the table once more (note the x / y columns at the end) before aggregating:
glimpse(crime_data)
## Rows: 9,893
## Columns: 26
## $ CaseId             <chr> "5fe24546-c87c-18da-848a-e723ecc8c977", "5fe24550-c~
## $ Date               <date> 2022-11-15, 2022-11-15, 2022-11-15, 2022-11-15, 20~
## $ Time               <chr> "21:28", "21:00", "15:05", "14:25", "12:07", "10:54~
## $ Weekday            <chr> "Teisipäev", "Teisipäev", "Teisipäev", "Teisipäev",~
## $ CaseType           <chr> "PISIVARGUS", "VARGUS", "PISIVARGUS", "PISIVARGUS",~
## $ CaseTypeAdditional <chr> NA, "MOBIILTELEFONIVARGUS,TASKUVARGUS", NA, NA, "MU~
## $ Law                <chr> "Karistusseadustik", "Karistusseadustik", "Karistus~
## $ Paragraph          <chr> "§ 218.", "§ 199.", "§ 218.", "§ 218.", "§ 199.", "~
## $ ParagraphFull      <chr> "§ 218. Varavastane süütegu väheväärtusliku asja ja~
## $ Section            <chr> "lg. 1.", "lg. 1.", "lg. 1.", "lg. 1.", "lg. 2.", "~
## $ DamagesEuro        <chr> "0-499", NA, "0-499", "0-499", "0-499", NA, "0-499"~
## $ PlaceType          <chr> "AVALIK_KOHT,KAUPLUS", "AVALIK_KOHT,KASIINO", "AVAL~
## $ County             <chr> "Harju maakond", "Harju maakond", "Harju maakond", ~
## $ Municipality       <chr> "Maardu linn", "Tallinn", "Tallinn", "Tallinn", "Ta~
## $ Place              <chr> "Maardu linn", "Kesklinna linnaosa", "Lasnamäe linn~
## $ x_min              <int> 6593000, 6589000, 6587000, 6587000, 6587500, 658200~
## $ x_max              <int> 6593999, 6589499, 6587499, 6587499, 6587999, 658249~
## $ y_min              <int> 557000, 542500, 545000, 545000, 537000, 540000, 544~
## $ y_max              <int> 557999, 542999, 545499, 545499, 537499, 540499, 544~
## $ Type               <chr> "VT", "KT", "VT", "VT", "KT", "VT", "VT", "VT", "VT~
## $ wday               <ord> Tue, Tue, Tue, Tue, Tue, Tue, Tue, Mon, Mon, Mon, M~
## $ hour               <int> 21, 21, 15, 14, 12, 10, 1, 23, 19, 17, 16, 13, 12, ~
## $ Lest_X_bu          <chr> "6593000-6593999", "6589000-6589499", "6587000-6587~
## $ Lest_Y_bu          <chr> "557000-557999", "542500-542999", "545000-545499", ~
## $ y                  <dbl> 6593500, 6589250, 6587250, 6587250, 6587750, 658225~
## $ x                  <dbl> 557499.5, 542749.5, 545249.5, 545249.5, 537249.5, 5~
# Number of crimes per grid cell (one row per distinct x / y centroid);
# .groups = "drop" returns an ungrouped table.
crime_data_grd_aggr <- crime_data %>% 
  group_by(x, y) %>% 
  summarise(n = n(), .groups = "drop")
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
# Aggregated cells on top of the counties; both colour and transparency
# encode the number of crimes in a cell.
ggplot(data = county) +
  geom_sf(colour = "grey", fill = "grey90", size = 0.25) +
  geom_point(
    data = crime_data_grd_aggr,
    mapping = aes(x = x, y = y, colour = n, alpha = n)
  ) +
  scale_colour_gradientn(colours = c("black", "red", "orange", "yellow"))

Interactive map could give a better overview? Try!

Convert data frame to sf-object:

# Build point geometries from the x / y columns; crs = 3301 is the EPSG code
# of L-EST97, the coordinate system the crime coordinates are given in.
crime_data_grd_aggr_sf <- st_as_sf(crime_data_grd_aggr, coords = c("x", "y"), crs = 3301)

Not working? Why? Read the error message and solve it!

# Switch tmap from static plotting to the interactive viewer.
tmap_mode("view")
## tmap mode set to interactive viewing
# Two layers: faint county polygons as background, then the aggregated crime
# cells as dots coloured by their count n.
# NOTE(review): "log10_pretty" presumably gives log-scaled class breaks - confirm in the tmap docs.
tm_shape(county)+
  tm_polygons(col = "firebrick", border.col = "grey30", lwd = 0.2, alpha = 0.1)+
tm_shape(crime_data_grd_aggr_sf)+
  tm_dots("n", palette = c("pink", "red", "black"), border.lwd = 0.1, style = "log10_pretty")

The general picture remained the same but the dataset is 10x smaller (9893 rows vs 972 rows). The only problem with the previous plot is that the cells with large values (crime hotspots) are buried under other points.
Can we change the plotting order (bigger values on top)? Yes! We can sort rows by value:

# Points are drawn in row order, so sorting ascending by n puts the cells
# with the most crimes on top.
crime_data_grd_aggr <- arrange(crime_data_grd_aggr, n)

ggplot(data = county) +
  geom_sf(colour = "grey", fill = "grey90", size = 0.25) +
  geom_point(
    data = crime_data_grd_aggr,
    mapping = aes(x = x, y = y, colour = n)
  ) +
  scale_colour_gradientn(colours = c("black", "red", "orange", "yellow"))

Temporal dynamics of crimes

Group data by weekdays and hours, count number of crimes by groups. Create a plot:

# Number of crimes for every weekday / hour combination;
# .groups = "drop" returns an ungrouped table.
crime_data_aggr_hourWday <- crime_data %>% 
  group_by(wday, hour) %>% 
  summarise(n = n(), .groups = "drop")
## `summarise()` has grouped output by 'wday'. You can override using the
## `.groups` argument.
# One panel per weekday: crimes per hour as a line, x axis ticks every 3 hours.
ggplot(crime_data_aggr_hourWday, aes(x = hour, y = n)) +
  geom_line() +
  facet_wrap(~wday) +
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  theme_minimal()

The result looks almost okay! By default R assumes that the week starts on Sunday, so Sunday is shown as the first day of the week. But we can change that! Let’s define Monday as the first day of the week:

# List the weekday labels that occur in the data.
distinct(crime_data, wday)
## # A tibble: 7 x 1
##   wday 
##   <ord>
## 1 Tue  
## 2 Mon  
## 3 Sun  
## 4 Sat  
## 5 Fri  
## 6 Thu  
## 7 Wed
# Rebuild the weekday factor with Monday as the first level.
crime_data <- crime_data %>% 
  mutate(wday = factor(wday, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")))

Let’s focus on public order violation (avaliku korra rikkumine):

# Keep only the cases registered as public order violations.
# NOTE(review): CaseType can hold comma-separated combinations; the exact
# match below keeps only rows whose type is this single value - confirm intent.
crime_data_publicOrder <- crime_data %>% 
  filter(CaseType == "AVALIKU_KORRA_RIKKUMINE")

# Count the filtered cases (not the full data set) per weekday and hour.
crime_data_aggr_hourWday <- crime_data_publicOrder %>% 
  group_by(wday, hour) %>% 
  summarise(n = n(), .groups = "drop")

# One panel per weekday, now in Monday-first order.
ggplot()+
  theme_minimal()+
  geom_line(data = crime_data_aggr_hourWday, aes(x = hour, y=n))+
  facet_wrap(~wday)+
  scale_x_continuous(breaks = seq(0, 24, 3))

Line graph is not the best type, because the weekdays are not connected with each other. Make a bar plot:

# Same data as bars: one red column per hour, one panel per weekday.
ggplot(crime_data_aggr_hourWday, aes(x = hour, y = n)) +
  geom_col(fill = "red") +
  facet_wrap(~wday) +
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  theme_minimal()

Many other possibilities:

# Lollipop chart: a vertical segment from 0 to n with a point at its top.
ggplot(data = crime_data_aggr_hourWday) +
  geom_segment(aes(x = hour, xend = hour, y = 0, yend = n), colour = "red") +
  geom_point(aes(x = hour, y = n), colour = "red") +
  facet_wrap(~wday) +
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  theme_minimal()

Hard to compare? Put all days on the same plot:

# All weekdays in one panel, one coloured line per day.
ggplot(
  data = crime_data_aggr_hourWday,
  mapping = aes(x = hour, y = n, group = wday, colour = wday)
) +
  geom_line() +
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  scale_colour_manual(values = c("dodgerblue", "green", "blue", "grey", "orange", "red", "magenta")) +
  theme_minimal()

Still not very beautiful. We can calculate the mean for workdays and for weekend:

# Label every weekday / hour row as weekend (Sat, Sun) or workday, then
# average the hourly counts within each of the two classes.
# NOTE(review): weekday / hour combinations without any case have no row, so
# they do not enter the mean as zeros - confirm this is acceptable.
crime_data_aggr_hourWeekend <- crime_data_aggr_hourWday %>% 
  mutate(weekend = if_else(wday == "Sat" | wday == "Sun", "weekend", "workday")) %>% 
  group_by(weekend, hour) %>% 
  summarise(n = mean(n), .groups = "drop")

# Line plus hollow points for each class, sharing one set of aesthetics.
ggplot(
  data = crime_data_aggr_hourWeekend,
  mapping = aes(x = hour, y = n, colour = weekend)
) +
  geom_line() +
  geom_point(shape = 21) +
  labs(title = "Average dynamics of crimes during the day", caption = "author: A. Aasa") +
  scale_colour_manual(values = c("orange", "dodgerblue")) +
  scale_x_continuous(breaks = seq(0, 24, 3)) +
  theme_minimal()

Crimes in Tartu?

We don’t have to stay on a national level, we can also “zoom in”. For example we can download base map for Tartu and put crimes on that map.

# Contour of Tartu: fetch the zipped shapefile, unpack it into the working
# directory and read the layer into R.
download.file(
  url = "http://aasa.ut.ee/Rspatial/data/trt_cont.zip",
  destfile = "trt_cont.zip"
)
unzip(zipfile = "trt_cont.zip")

trt_cont <- st_read(dsn = "trt_cont.shp")
## Reading layer `trt_cont' from data source 
##   `C:\ANTO\loengud\geopythonR\rspatial_Git\rspatial\trt_cont.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 4 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 655873.6 ymin: 6469758 xmax: 663717.9 ymax: 6477657
## Projected CRS: Estonian Coordinate System of 1997
# Declare the layer's coordinate system as EPSG:3301 (assignment only, no reprojection).
trt_cont <- st_set_crs(trt_cont, 3301)

# The crimes are taken from the sf object created earlier:
glimpse(crime_data_grd_aggr_sf)
## Rows: 971
## Columns: 2
## $ n        <int> 1, 1, 1, 1, 2, 1, 1, 1, 5, 5, 1, 17, 2, 6, 5, 1, 6, 1, 1, 1, ~
## $ geometry <POINT [m]> POINT (389499.5 6533500), POINT (396499.5 6448500), POI~
# Clip to Tartu: keep only the crime cells that intersect the city contour
# (the attributes of the contour are attached to the result).
crime_data_grd_aggr_sf_trt <- crime_data_grd_aggr_sf %>% 
  st_intersection(trt_cont)


# Black city outline with square markers; colour and size both encode n.
ggplot() +
  theme_minimal() +
  theme(legend.position = "right") +
  geom_sf(data = trt_cont, fill = "black") +
  geom_sf(
    data = crime_data_grd_aggr_sf_trt,
    mapping = aes(colour = n, size = n),
    shape = 15
  ) +
  scale_colour_gradientn(colours = c("darkgreen", "grey95", "orange"))

Author: Anto Aasa
Supervisors: Anto Aasa & Lika Zhvania
Geospatial analysis in R
LTOM.02.041
Last update: 2022-11-21 15:33:17