Sorting out world cities database

This data is purchased from www.worldcitiesdatabase.com. There are some errors in the instructions so this is the original version with my adjustments so that it works. These instructions are also specific to ‘how’ I wish to use the data in the future.

Extract the data:

Extract the zip. Note down the path of the extracted directory which will be used later. Lets say this path is /mysql – which contains city.dat, states.dat and country.dat file.

Creation of tables:

DROP TABLE IF EXISTS country;
DROP TABLE IF EXISTS states;
DROP TABLE IF EXISTS city ;
DROP TABLE IF EXISTS timezone;

create table country(country_code varchar(2), country_name varchar(255), continent varchar(20), dialingPrefix varchar(20));

create table states(country_code varchar(2),state_code varchar(20),state_name varchar(255));

create table city (country_code varchar(2), state_code varchar(20),city_name varchar(255),city_name_ascii varchar(255), latitude float DEFAULT 0, longitude float DEFAULT 0,timezoneid  varchar(255));

create table timezone(country_code varchar(2),timezoneid varchar(255), gmtoffset float, dstoffest float, rawoffset float);

Alter table definition to support utf8 using following command

ALTER TABLE country CONVERT TO CHARACTER SET utf8 COLLATE utf8_persian_ci;
ALTER TABLE states CONVERT TO CHARACTER SET utf8 COLLATE utf8_persian_ci;
ALTER TABLE city CONVERT TO CHARACTER SET utf8 COLLATE utf8_persian_ci;
ALTER TABLE timezone CONVERT TO CHARACTER SET utf8 COLLATE utf8_persian_ci;

Load in the data

Note: Original documentation says ‘LOAD DATA LOCAL INFILE’, this produces an error so just use LOAD DATA INFILE as below

LOAD DATA INFILE '/xampp/mysql/country.dat' INTO TABLE country CHARACTER SET UTF8 fields terminated by '\t' LINES TERMINATED BY '\r\n'; 

LOAD DATA INFILE '/xampp/mysql/states.dat' INTO TABLE states CHARACTER SET UTF8 fields terminated by '\t' LINES TERMINATED BY '\r\n';

LOAD DATA INFILE '/xampp/mysql/city.dat' INTO TABLE city CHARACTER SET UTF8 fields terminated by '\t' LINES TERMINATED BY '\r\n'; 

LOAD DATA INFILE '/xampp/mysql/timezone.dat' INTO TABLE timezone CHARACTER SET UTF8 fields terminated by '\t' LINES TERMINATED BY '\r\n'; 

Corrections in the database (places I’ve come across that just aren’t there)

In the ‘city’ table, there appears to be no entry for

Mykolaiv – in the Mykolaiv region (state_code 16) – in the UKRAINE, so this needs to be manually added as follows;

INSERT INTO city (country_code, state_code, city_name, city_name_ascii, latitude, longitude, timezoneid) VALUES ("UA","16", "Mykolaiv", "Mykolaiv", "46.975033","31.994583","");

create table city (country_code varchar(2), state_code varchar(20),city_name varchar(255),city_name_ascii varchar(255), latitude float DEFAULT 0, longitude float DEFAULT 0,timezoneid varchar(255));

Creation of a ‘just europe’ database

DROP TABLE IF EXISTS eurocountry;
DROP TABLE IF EXISTS eurostates;
DROP TABLE IF EXISTS eurocity;
DROP TABLE IF EXISTS eurotimezone;

create table eurocountry(country_code varchar(2), country_name varchar(255), continent varchar(20), dialingPrefix varchar(20));

create table eurostates(country_code varchar(2),state_code varchar(20),state_name varchar(255));

create table eurocity (country_code varchar(2), state_code varchar(20),city_name varchar(255),city_name_ascii varchar(255), latitude float DEFAULT 0, longitude float DEFAULT 0,timezoneid  varchar(255));

create table eurotimezone(country_code varchar(2),timezoneid varchar(255), gmtoffset float, dstoffest float, rawoffset float);

ALTER TABLE eurocountry CONVERT TO CHARACTER SET utf8 COLLATE utf8_persian_ci;
ALTER TABLE eurostates CONVERT TO CHARACTER SET utf8 COLLATE utf8_persian_ci;
ALTER TABLE eurocity CONVERT TO CHARACTER SET utf8 COLLATE utf8_persian_ci;
ALTER TABLE eurotimezone CONVERT TO CHARACTER SET utf8 COLLATE utf8_persian_ci;

INSERT INTO eurocountry SELECT * FROM country WHERE country_code="AD"  OR country_code="AL" OR country_code="AT" OR country_code="AX" OR country_code="BA" OR country_code="BE" OR country_code="BG" OR country_code="BY" OR country_code="CH" OR country_code="CY" OR country_code="CZ" OR country_code="DE" OR country_code="DK" OR country_code="EE" OR country_code="ES" OR country_code="FI" OR country_code="FO" OR country_code="FR" OR country_code="GB" OR country_code="GG" OR country_code="GI" OR country_code="GR" OR country_code="HR" OR country_code="HU" OR country_code="IE" OR country_code="IM" OR country_code="IS" OR country_code="IT" OR country_code="JE" OR country_code="XK" OR country_code="LI" OR country_code="LT" OR country_code="LU" OR country_code="LV" OR country_code="MC" OR country_code="MD" OR country_code="ME" OR country_code="MK" OR country_code="MT" OR country_code="NL" OR country_code="NO" OR country_code="PL" OR country_code="PT" OR country_code="RO" OR country_code="RS" OR country_code="RU" OR country_code="SE" OR country_code="SI" OR country_code="SJ" OR country_code="SK" OR country_code="SM" OR country_code="TR" OR country_code="UA";

INSERT INTO eurostates SELECT * FROM states WHERE country_code="AD" OR country_code="AL" OR country_code="AT" OR country_code="AX" OR country_code="BA" OR country_code="BE" OR country_code="BG" OR country_code="BY" OR country_code="CH" OR country_code="CY" OR country_code="CZ" OR country_code="DE" OR country_code="DK" OR country_code="EE" OR country_code="ES" OR country_code="FI" OR country_code="FO" OR country_code="FR" OR country_code="GB" OR country_code="GG" OR country_code="GI" OR country_code="GR" OR country_code="HR" OR country_code="HU" OR country_code="IE" OR country_code="IM" OR country_code="IS" OR country_code="IT" OR country_code="JE" OR country_code="XK" OR country_code="LI" OR country_code="LT" OR country_code="LU" OR country_code="LV" OR country_code="MC" OR country_code="MD" OR country_code="ME" OR country_code="MK" OR country_code="MT" OR country_code="NL" OR country_code="NO" OR country_code="PL" OR country_code="PT" OR country_code="RO" OR country_code="RS"  OR country_code="RU" OR country_code="SE" OR country_code="SI" OR country_code="SJ" OR country_code="SK" OR country_code="SM" OR country_code="TR" OR country_code="UA";

INSERT INTO eurocity SELECT * FROM city WHERE country_code="AD" OR country_code="AL" OR country_code="AT" OR country_code="AX" OR country_code="BA" OR country_code="BE" OR country_code="BG" OR country_code="BY" OR country_code="CH" OR country_code="CY" OR country_code="CZ" OR country_code="DE" OR country_code="DK" OR country_code="EE" OR country_code="ES" OR country_code="FI" OR country_code="FO" OR country_code="FR" OR country_code="GB" OR country_code="GG" OR country_code="GI" OR country_code="GR" OR country_code="HR" OR country_code="HU" OR country_code="IE" OR country_code="IM" OR country_code="IS" OR country_code="IT" OR country_code="JE" OR country_code="XK" OR country_code="LI" OR country_code="LT" OR country_code="LU" OR country_code="LV" OR country_code="MC" OR country_code="MD" OR country_code="ME" OR country_code="MK" OR country_code="MT" OR country_code="NL" OR country_code="NO" OR country_code="PL" OR country_code="PT" OR country_code="RO" OR country_code="RS" OR country_code="RU" OR country_code="SE" OR country_code="SI" OR country_code="SJ" OR country_code="SK" OR country_code="SM" OR country_code="TR" OR country_code="UA";

INSERT INTO eurotimezone SELECT * FROM timezone WHERE country_code="AD" OR country_code="AL" OR country_code="AT" OR country_code="AX" OR country_code="BA" OR country_code="BE" OR country_code="BG" OR country_code="BY" OR country_code="CH" OR country_code="CY" OR country_code="CZ" OR country_code="DE" OR country_code="DK" OR country_code="EE" OR country_code="ES" OR country_code="FI" OR country_code="FO" OR country_code="FR" OR country_code="GB" OR country_code="GG" OR country_code="GI" OR country_code="GR" OR country_code="HR" OR country_code="HU" OR country_code="IE" OR country_code="IM" OR country_code="IS" OR country_code="IT" OR country_code="JE" OR country_code="XK" OR country_code="LI" OR country_code="LT" OR country_code="LU" OR country_code="LV" OR country_code="MC" OR country_code="MD" OR country_code="ME" OR country_code="MK" OR country_code="MT" OR country_code="NL" OR country_code="NO" OR country_code="PL" OR country_code="PT" OR country_code="RO" OR country_code="RS" OR country_code="RU" OR country_code="SE" OR country_code="SI" OR country_code="SJ" OR country_code="SK" OR country_code="SM" OR country_code="TR" OR country_code="UA";

To speed up just Euro tables, index them

The following query:

SELECT city_name_ascii, country_code FROM eurocity WHERE country_code=”FR” AND state_code=76 AND city_name_ascii LIKE ‘Fa%’ ORDER BY CHAR_LENGTH(city_name_ascii) ASC LIMIT 10

takes 3.7 seconds

This needs some experimentation to decide what to index, but below is the syntax to index the fields in the tables

ALTER TABLE eurocity ADD INDEX(city_name_ascii);
ALTER TABLE eurocity ADD INDEX(country_code);
ALTER TABLE eurocity ADD INDEX(state_code);
ALTER TABLE eurostates ADD INDEX(country_code);
ALTER TABLE eurostates ADD INDEX(state_code);
ALTER TABLE eurocountry ADD INDEX(country_code);
ALTER TABLE eurocountry ADD INDEX(country_name);

The same query:

The below works well once proper indices are done 2020-09-22:

SELECT eurocity.city_name_ascii, eurostates.state_name, eurocountry.country_name, eurocity.latitude, eurocity.longitude
FROM eurocity
INNER JOIN eurostates ON eurocity.state_code=eurostates.state_code AND eurocity.country_code=eurostates.country_code
INNER JOIN eurocountry ON eurocity.country_code=eurocountry.country_code
WHERE eurocity.city_name_ascii LIKE ‘Fa%’ ORDER BY CHAR_LENGTH(eurocity.city_name_ascii) LIMIT 10

Using right joins, it appears to be even faster

SELECT eurocity.city_name_ascii, eurostates.state_name, eurocountry.country_name, eurocity.latitude, eurocity.longitude
FROM eurocity
RIGHT JOIN eurostates ON eurocity.state_code=eurostates.state_code AND eurocity.country_code=eurostates.country_code
RIGHT JOIN eurocountry ON eurocity.country_code=eurocountry.country_code
WHERE eurocity.city_name_ascii LIKE ‘Fag%’ ORDER BY CHAR_LENGTH(eurocity.city_name_ascii) LIMIT 10

Now takes 0.01 seconds after the fields have been indexed.

Attempting to create an autosuggestion system:

SELECT eurocity.city_name_ascii, eurostates.state_name, eurocountry.country_name, CONCAT(eurocity.city_name_ascii,' ,',eurostates.state_name,' ,',eurocountry.country_name) AS minnow FROM eurocity INNER JOIN eurostates ON eurocity.state_code=eurostates.state_code AND eurocity.country_code=eurostates.country_code INNER JOIN eurocountry ON eurocity.country_code=eurocountry.country_code WHERE CONCAT(eurocity.city_name_ascii,' ,',eurostates.state_name,' ,',eurocountry.country_name) LIKE 'Fa %' ORDER BY CHAR_LENGTH(CONCAT(eurocity.city_name_ascii,' ,',eurostates.state_name,' ,',eurocountry.country_name)) LIMIT 10

The above is VERY slow, we are doing several concatenations in the select query.

To try and remedy this, we’ll create a new table that allows us to search it quicker. Effectively we’ll do all the concatenations, and put this information into a new table, and it’s this new table that we’ll actually be querying.

DROP TABLE IF EXISTS eurofullsearch;
 
create table eurofullsearch(
city_name_ascii varchar(255), 
state_name varchar(20), 
country_name varchar(20), 
latitude float, 
longitude float, 
city_state_country varchar (255));
ALTER TABLE eurofullsearch CONVERT TO CHARACTER SET utf8 COLLATE utf8_persian_ci;

Note: city_state_country is a field which is a concatenation for the city, state, and country. So full code is:

DROP TABLE IF EXISTS eurofullsearch;
 
create table eurofullsearch(
city_name_ascii varchar(255), 
state_name varchar(20), 
country_name varchar(20), 
latitude float, 
longitude float);

ALTER TABLE eurofullsearch CONVERT TO CHARACTER SET utf8 COLLATE utf8_persian_ci;

INSERT INTO eurofullsearch SELECT eurocity.city_name_ascii, eurostates.state_name, eurocountry.country_name, eurocity.latitude, eurocity.longitude FROM eurocity INNER JOIN eurostates ON eurocity.state_code=eurostates.state_code AND eurocity.country_code=eurostates.country_code INNER JOIN eurocountry ON eurocity.country_code=eurocountry.country_code 

Now let’s try a query;

SELECT minnow FROM eurofullsearch WHERE minnow LIKE 'Fa %' ORDER BY CHAR_LENGTH(minnow) LIMIT 10

Not bad, 2 seconds, and this is speeded up by indexing ‘minnow’ in the eurofullsearch table;

ALTER TABLE eurofullsearch ADD INDEX(minnow) 

Creation of new column which holds first letter of another column

ALTER TABLE  eurofullsearch ADD firstLetter char(1);

// substring (column name, startpoint, numberofCharactersFromStartPoint)

UPDATE eurofullsearch SET firstLetter = SUBSTRING(minnow, 1, 1);

ALTER TABLE eurofullsearch ADD INDEX (firstLetter);


2020-09-23

The new code for the creation of eurofullsearch is:

DROP TABLE IF EXISTS eurofullsearch;
 
create table eurofullsearch(
city_name_ascii varchar(255), 
state_name varchar(255), 
country_name varchar(255), 
latitude float, 
longitude float);

ALTER TABLE eurofullsearch CONVERT TO CHARACTER SET utf8 COLLATE utf8_persian_ci;

INSERT INTO eurofullsearch SELECT eurocity.city_name_ascii, eurostates.state_name, eurocountry.country_name, eurocity.latitude, eurocity.longitude FROM eurocity INNER JOIN eurostates ON eurocity.state_code=eurostates.state_code AND eurocity.country_code=eurostates.country_code INNER JOIN eurocountry ON eurocity.country_code=eurocountry.country_code; 

ALTER TABLE eurofullsearch ADD COLUMN id INT AUTO_INCREMENT UNIQUE NOT NULL;
ALTER TABLE eurofullsearch ADD INDEX(city_name_ascii); 

Leave a Reply