diff --git a/build_html.sh b/build_html.sh old mode 100644 new mode 100755 diff --git a/source/data/can_lang-meta-data.csv b/source/data/can_lang-meta-data.csv deleted file mode 100644 index 527d5cd6..00000000 --- a/source/data/can_lang-meta-data.csv +++ /dev/null @@ -1,430 +0,0 @@ -Source: Statistics Canada, Census of Population, 2016. Reproduced and distributed on an "as is" basis with the permission of Statistics Canada. -Date collected: 2020/07/09 -Aboriginal languages,"Aboriginal languages, n.o.s.",590,235,30,665 -Non-Official & Non-Aboriginal languages,Afrikaans,10260,4785,85,23415 -Non-Official & Non-Aboriginal languages,"Afro-Asiatic languages, n.i.e.",1150,445,10,2775 -Non-Official & Non-Aboriginal languages,Akan (Twi),13460,5985,25,22150 -Non-Official & Non-Aboriginal languages,Albanian,26895,13135,345,31930 -Aboriginal languages,"Algonquian languages, n.i.e.",45,10,0,120 -Aboriginal languages,Algonquin,1260,370,40,2480 -Non-Official & Non-Aboriginal languages,American Sign Language,2685,3020,1145,21930 -Non-Official & Non-Aboriginal languages,Amharic,22465,12785,200,33670 -Non-Official & Non-Aboriginal languages,Arabic,419890,223535,5585,629055 -Non-Official & Non-Aboriginal languages,Armenian,33460,21510,450,41295 -Non-Official & Non-Aboriginal languages,Assyrian Neo-Aramaic,16070,10510,205,19740 -Aboriginal languages,"Athabaskan languages, n.i.e.",50,10,0,85 -Aboriginal languages,Atikamekw,6150,5465,1100,6645 -Non-Official & Non-Aboriginal languages,"Austro-Asiatic languages, n.i.e",170,80,0,190 -Non-Official & Non-Aboriginal languages,"Austronesian languages, n.i.e.",4195,1160,35,5585 -Non-Official & Non-Aboriginal languages,Azerbaijani,3255,1245,25,5455 -Aboriginal languages,Babine (Wetsuwet'en),110,20,10,210 -Non-Official & Non-Aboriginal languages,Bamanankan,1535,345,0,3190 -Aboriginal languages,Beaver,190,50,0,340 -Non-Official & Non-Aboriginal languages,Belarusan,810,225,0,2265 -Non-Official & Non-Aboriginal 
languages,Bengali,73125,47350,525,91220 -Non-Official & Non-Aboriginal languages,"Berber languages, n.i.e.",8985,2615,15,12510 -Non-Official & Non-Aboriginal languages,Bikol,1785,290,0,2075 -Non-Official & Non-Aboriginal languages,Bilen,805,615,15,1085 -Aboriginal languages,Blackfoot,2815,1110,85,5645 -Non-Official & Non-Aboriginal languages,Bosnian,12215,6045,155,18265 -Non-Official & Non-Aboriginal languages,Bulgarian,20020,11985,200,22425 -Non-Official & Non-Aboriginal languages,Burmese,3585,2245,75,4995 -Non-Official & Non-Aboriginal languages,Cantonese,565270,400220,58820,699125 -Aboriginal languages,Carrier,1025,250,15,2100 -Non-Official & Non-Aboriginal languages,Catalan,870,350,30,2035 -Aboriginal languages,Cayuga,45,10,10,125 -Non-Official & Non-Aboriginal languages,Cebuano,19890,7205,70,27040 -Non-Official & Non-Aboriginal languages,"Celtic languages, n.i.e.",525,80,10,3595 -Non-Official & Non-Aboriginal languages,Chaldean Neo-Aramaic,5545,3445,35,7115 -Aboriginal languages,Chilcotin,655,255,15,1150 -Non-Official & Non-Aboriginal languages,"Chinese languages, n.i.e.",615,280,0,590 -Non-Official & Non-Aboriginal languages,"Chinese, n.o.s.",38580,23940,2935,41685 -Aboriginal languages,Comox,85,0,0,185 -Aboriginal languages,"Cree, n.o.s.",64050,37950,7800,86115 -Non-Official & Non-Aboriginal languages,"Creole languages, n.i.e.",4985,2005,15,16635 -Non-Official & Non-Aboriginal languages,"Creole, n.o.s.",64110,24570,310,133045 -Non-Official & Non-Aboriginal languages,Croatian,48200,16775,220,69835 -Non-Official & Non-Aboriginal languages,"Cushitic languages, n.i.e.",365,180,0,480 -Non-Official & Non-Aboriginal languages,Czech,22295,6235,70,28725 -Aboriginal languages,Dakota,1210,255,20,1760 -Non-Official & Non-Aboriginal languages,Danish,12630,855,85,15750 -Aboriginal languages,Dene,10700,7710,770,13060 -Non-Official & Non-Aboriginal languages,Dinka,2120,1130,0,2475 -Aboriginal languages,Dogrib (Tlicho),1650,1020,165,2375 -Non-Official & Non-Aboriginal 
languages,"Dravidian languages, n.i.e.",490,190,0,790 -Non-Official & Non-Aboriginal languages,Dutch,99015,9565,1165,120870 -Non-Official & Non-Aboriginal languages,Edo,1670,410,0,3220 -Official languages,English,19460850,22162865,15265335,29748265 -Non-Official & Non-Aboriginal languages,Estonian,5445,975,55,6070 -Non-Official & Non-Aboriginal languages,Ewe,1760,405,10,3000 -Non-Official & Non-Aboriginal languages,Fijian,745,195,0,1665 -Non-Official & Non-Aboriginal languages,Finnish,15295,2790,105,17590 -Official languages,French,7166700,6943800,3825215,10242945 -Non-Official & Non-Aboriginal languages,Frisian,2100,185,40,2910 -Non-Official & Non-Aboriginal languages,"Fulah (Pular, Pulaar, Fulfulde)",2825,825,0,4725 -Non-Official & Non-Aboriginal languages,Ga,920,250,0,2250 -Non-Official & Non-Aboriginal languages,Ganda,1295,345,25,2495 -Non-Official & Non-Aboriginal languages,Georgian,1710,1040,25,2150 -Non-Official & Non-Aboriginal languages,German,384040,120335,10065,502735 -Non-Official & Non-Aboriginal languages,"Germanic languages, n.i.e.",525,1630,725,8705 -Aboriginal languages,Gitxsan (Gitksan),880,315,10,1305 -Non-Official & Non-Aboriginal languages,Greek,106525,44550,1020,150965 -Non-Official & Non-Aboriginal languages,Gujarati,108780,64150,885,149045 -Aboriginal languages,Gwich'in,255,50,10,360 -Aboriginal languages,Haida,80,10,0,465 -Aboriginal languages,Haisla,90,20,0,175 -Non-Official & Non-Aboriginal languages,Haitian Creole,3030,1280,25,6855 -Non-Official & Non-Aboriginal languages,Hakka,10910,4085,70,12445 -Aboriginal languages,Halkomelem,480,50,20,1060 -Non-Official & Non-Aboriginal languages,Harari,1320,735,0,1715 -Non-Official & Non-Aboriginal languages,Hebrew,19530,8560,825,75020 -Aboriginal languages,Heiltsuk,100,5,10,125 -Non-Official & Non-Aboriginal languages,Hiligaynon,6880,2210,25,7925 -Non-Official & Non-Aboriginal languages,Hindi,110645,55510,1405,433365 -Non-Official & Non-Aboriginal languages,Hmong-Mien languages,795,335,10,870 
-Non-Official & Non-Aboriginal languages,Hungarian,61235,19480,440,71285 -Non-Official & Non-Aboriginal languages,Icelandic,1285,270,0,1780 -Non-Official & Non-Aboriginal languages,Igbo,4235,1000,10,8855 -Non-Official & Non-Aboriginal languages,Ilocano,26345,9125,110,34530 -Non-Official & Non-Aboriginal languages,"Indo-Iranian languages, n.i.e.",5185,2380,20,8870 -Aboriginal languages,Inuinnaqtun (Inuvialuktun),1020,165,30,1975 -Aboriginal languages,"Inuit languages, n.i.e.",310,90,15,470 -Aboriginal languages,Inuktitut,35210,29230,8795,40620 -Aboriginal languages,"Iroquoian languages, n.i.e.",35,5,0,115 -Non-Official & Non-Aboriginal languages,Italian,375635,115415,1705,574725 -Non-Official & Non-Aboriginal languages,"Italic (Romance) languages, n.i.e.",720,175,25,2680 -Non-Official & Non-Aboriginal languages,Japanese,43640,19785,3255,83095 -Non-Official & Non-Aboriginal languages,Kabyle,13150,5490,15,17120 -Non-Official & Non-Aboriginal languages,Kannada,3970,1630,10,8245 -Non-Official & Non-Aboriginal languages,Karenic languages,4705,3860,135,4895 -Non-Official & Non-Aboriginal languages,Kashmiri,565,135,0,905 -Aboriginal languages,Kaska (Nahani),180,20,10,365 -Non-Official & Non-Aboriginal languages,Khmer (Cambodian),20130,10885,475,27035 -Non-Official & Non-Aboriginal languages,Kinyarwanda (Rwanda),5250,1530,25,7860 -Non-Official & Non-Aboriginal languages,Konkani,3330,720,10,6790 -Non-Official & Non-Aboriginal languages,Korean,153425,109705,12150,172750 -Non-Official & Non-Aboriginal languages,Kurdish,11705,6580,185,15290 -Aboriginal languages,Kutenai,110,10,0,170 -Aboriginal languages,Kwakiutl (Kwak'wala),325,25,15,605 -Non-Official & Non-Aboriginal languages,Lao,12670,6175,150,17235 -Non-Official & Non-Aboriginal languages,Latvian,5450,1255,35,6500 -Aboriginal languages,Lillooet,315,25,15,790 -Non-Official & Non-Aboriginal languages,Lingala,3805,1045,10,17010 -Non-Official & Non-Aboriginal languages,Lithuanian,7075,2015,60,8185 -Non-Official & 
Non-Aboriginal languages,Macedonian,16770,6830,95,23075 -Non-Official & Non-Aboriginal languages,Malagasy,1430,430,0,2340 -Non-Official & Non-Aboriginal languages,Malay,12275,3625,140,22470 -Non-Official & Non-Aboriginal languages,Malayalam,28565,15440,95,37810 -Aboriginal languages,Malecite,300,55,10,760 -Non-Official & Non-Aboriginal languages,Maltese,5565,1125,25,7625 -Non-Official & Non-Aboriginal languages,Mandarin,592040,462890,60090,814450 -Non-Official & Non-Aboriginal languages,Marathi,8295,3780,30,15565 -Aboriginal languages,Mi'kmaq,6690,3565,915,9025 -Aboriginal languages,Michif,465,80,10,1210 -Non-Official & Non-Aboriginal languages,Min Dong,1230,345,30,1045 -Non-Official & Non-Aboriginal languages,"Min Nan (Chaochow, Teochow, Fukien, Taiwanese)",31800,13965,565,42840 -Aboriginal languages,Mohawk,985,255,30,2415 -Non-Official & Non-Aboriginal languages,Mongolian,1575,905,10,2095 -Aboriginal languages,Montagnais (Innu),10235,8585,2055,11445 -Aboriginal languages,Moose Cree,105,10,0,195 -Aboriginal languages,Naskapi,1205,1195,370,1465 -Non-Official & Non-Aboriginal languages,Nepali,18275,13375,195,21385 -Non-Official & Non-Aboriginal languages,"Niger-Congo languages, n.i.e.",19135,4010,30,40760 -Non-Official & Non-Aboriginal languages,"Nilo-Saharan languages, n.i.e.",3750,1520,0,4550 -Aboriginal languages,Nisga'a,400,75,10,1055 -Aboriginal languages,North Slavey (Hare),765,340,95,1005 -Aboriginal languages,Northern East Cree,315,110,35,550 -Aboriginal languages,Northern Tutchone,220,30,0,280 -Non-Official & Non-Aboriginal languages,Norwegian,4615,350,70,8120 -Aboriginal languages,Nuu-chah-nulth (Nootka),280,30,10,560 -Aboriginal languages,Oji-Cree,12855,7905,1080,15605 -Aboriginal languages,Ojibway,17885,6175,765,28580 -Aboriginal languages,Okanagan,275,80,20,820 -Aboriginal languages,Oneida,60,15,0,185 -Non-Official & Non-Aboriginal languages,Oriya (Odia),1055,475,0,1530 -Non-Official & Non-Aboriginal languages,Oromo,4960,3410,45,6245 -Non-Official & 
Non-Aboriginal languages,"Other languages, n.i.e.",3685,1110,80,9730 -Aboriginal languages,Ottawa (Odawa),150,75,0,205 -Non-Official & Non-Aboriginal languages,"Pampangan (Kapampangan, Pampango)",4045,1200,10,5425 -Non-Official & Non-Aboriginal languages,Pangasinan,1390,240,0,1800 -Non-Official & Non-Aboriginal languages,Pashto,16905,10590,50,23180 -Non-Official & Non-Aboriginal languages,Persian (Farsi),214200,143025,4580,252325 -Aboriginal languages,Plains Cree,3065,1345,95,5905 -Non-Official & Non-Aboriginal languages,Polish,181710,74780,2495,214965 -Non-Official & Non-Aboriginal languages,Portuguese,221535,98710,7485,295955 -Non-Official & Non-Aboriginal languages,Punjabi (Panjabi),501680,349140,27865,668240 -Non-Official & Non-Aboriginal languages,Quebec Sign Language,695,730,130,4665 -Non-Official & Non-Aboriginal languages,Romanian,96660,53325,745,115050 -Non-Official & Non-Aboriginal languages,Rundi (Kirundi),5850,2110,0,8590 -Non-Official & Non-Aboriginal languages,Russian,188255,116595,4855,269645 -Aboriginal languages,"Salish languages, n.i.e.",260,25,0,560 -Aboriginal languages,Sarsi (Sarcee),80,10,0,145 -Non-Official & Non-Aboriginal languages,Scottish Gaelic,1090,190,15,3980 -Aboriginal languages,Sekani,85,15,0,185 -Non-Official & Non-Aboriginal languages,"Semitic languages, n.i.e.",2150,1205,65,3220 -Non-Official & Non-Aboriginal languages,Serbian,57350,31750,530,73780 -Non-Official & Non-Aboriginal languages,Serbo-Croatian,9550,3890,30,11275 -Non-Official & Non-Aboriginal languages,Shona,3185,1035,0,5430 -Aboriginal languages,Shuswap (Secwepemctsin),445,50,35,1305 -Non-Official & Non-Aboriginal languages,"Sign languages, n.i.e",4125,6690,645,22280 -Non-Official & Non-Aboriginal languages,Sindhi,11860,4975,35,20260 -Non-Official & Non-Aboriginal languages,Sinhala (Sinhalese),16335,7790,40,27825 -Aboriginal languages,"Siouan languages, n.i.e.",55,20,0,140 -Aboriginal languages,"Slavey, n.o.s.",280,105,10,675 -Non-Official & Non-Aboriginal 
languages,"Slavic languages, n.i.e.",2420,670,10,2995 -Non-Official & Non-Aboriginal languages,Slovak,17580,5610,100,21470 -Non-Official & Non-Aboriginal languages,Slovene (Slovenian),9785,2055,15,11490 -Non-Official & Non-Aboriginal languages,Somali,36755,22895,220,49660 -Aboriginal languages,South Slavey,945,370,35,1365 -Aboriginal languages,Southern East Cree,45,15,0,40 -Aboriginal languages,Southern Tutchone,70,5,0,145 -Non-Official & Non-Aboriginal languages,Spanish,458850,263505,13030,995260 -Aboriginal languages,Squamish,40,5,10,285 -Aboriginal languages,Stoney,3025,1950,240,3675 -Aboriginal languages,Straits,80,25,15,365 -Non-Official & Non-Aboriginal languages,Swahili,13370,5370,80,38685 -Aboriginal languages,Swampy Cree,1440,330,10,2350 -Non-Official & Non-Aboriginal languages,Swedish,6840,1050,125,14140 -Non-Official & Non-Aboriginal languages,"Tagalog (Pilipino, Filipino)",431385,213790,3450,612735 -Aboriginal languages,Tahltan,95,5,0,265 -Non-Official & Non-Aboriginal languages,"Tai-Kadai languages, n.i.e",85,30,0,115 -Non-Official & Non-Aboriginal languages,Tamil,140720,96955,2085,189860 -Non-Official & Non-Aboriginal languages,Telugu,15660,8280,40,23165 -Non-Official & Non-Aboriginal languages,Thai,9255,3365,525,15395 -Aboriginal languages,Thompson (Ntlakapamux),335,20,0,450 -Non-Official & Non-Aboriginal languages,Tibetan,6160,4590,50,7050 -Non-Official & Non-Aboriginal languages,"Tibeto-Burman languages, n.i.e.",1405,655,15,2380 -Non-Official & Non-Aboriginal languages,Tigrigna,16645,10205,130,21340 -Aboriginal languages,Tlingit,95,0,10,260 -Aboriginal languages,Tsimshian,200,30,10,410 -Non-Official & Non-Aboriginal languages,"Turkic languages, n.i.e.",1315,455,10,1875 -Non-Official & Non-Aboriginal languages,Turkish,32815,18955,690,50770 -Non-Official & Non-Aboriginal languages,Ukrainian,102485,28250,1210,132115 -Non-Official & Non-Aboriginal languages,"Uralic languages, n.i.e.",10,5,0,25 -Non-Official & Non-Aboriginal 
languages,Urdu,210815,128785,1495,322220 -Non-Official & Non-Aboriginal languages,Uyghur,1035,610,20,1390 -Non-Official & Non-Aboriginal languages,Uzbek,1720,995,15,2465 -Non-Official & Non-Aboriginal languages,Vietnamese,156430,104245,8075,198895 -Non-Official & Non-Aboriginal languages,Vlaams (Flemish),3895,355,35,4400 -Aboriginal languages,"Wakashan languages, n.i.e.",10,0,0,25 -Non-Official & Non-Aboriginal languages,Waray-Waray,1110,310,0,1395 -Non-Official & Non-Aboriginal languages,Welsh,1075,95,0,1695 -Non-Official & Non-Aboriginal languages,Wolof,3990,1385,10,8240 -Aboriginal languages,Woods Cree,1840,800,75,2665 -Non-Official & Non-Aboriginal languages,Wu (Shanghainese),12915,7650,105,16530 -Non-Official & Non-Aboriginal languages,Yiddish,13555,7085,895,20985 -Non-Official & Non-Aboriginal languages,Yoruba,9080,2615,15,22415 -Aboriginal languages,"Aboriginal languages, n.o.s.",590,235,30,665 -Non-Official & Non-Aboriginal languages,Afrikaans,10260,4785,85,23415 -Non-Official & Non-Aboriginal languages,"Afro-Asiatic languages, n.i.e.",1150,445,10,2775 -Non-Official & Non-Aboriginal languages,Akan (Twi),13460,5985,25,22150 -Non-Official & Non-Aboriginal languages,Albanian,26895,13135,345,31930 -Aboriginal languages,"Algonquian languages, n.i.e.",45,10,0,120 -Aboriginal languages,Algonquin,1260,370,40,2480 -Non-Official & Non-Aboriginal languages,American Sign Language,2685,3020,1145,21930 -Non-Official & Non-Aboriginal languages,Amharic,22465,12785,200,33670 -Non-Official & Non-Aboriginal languages,Arabic,419890,223535,5585,629055 -Non-Official & Non-Aboriginal languages,Armenian,33460,21510,450,41295 -Non-Official & Non-Aboriginal languages,Assyrian Neo-Aramaic,16070,10510,205,19740 -Aboriginal languages,"Athabaskan languages, n.i.e.",50,10,0,85 -Aboriginal languages,Atikamekw,6150,5465,1100,6645 -Non-Official & Non-Aboriginal languages,"Austro-Asiatic languages, n.i.e",170,80,0,190 -Non-Official & Non-Aboriginal languages,"Austronesian languages, 
n.i.e.",4195,1160,35,5585 -Non-Official & Non-Aboriginal languages,Azerbaijani,3255,1245,25,5455 -Aboriginal languages,Babine (Wetsuwet'en),110,20,10,210 -Non-Official & Non-Aboriginal languages,Bamanankan,1535,345,0,3190 -Aboriginal languages,Beaver,190,50,0,340 -Non-Official & Non-Aboriginal languages,Belarusan,810,225,0,2265 -Non-Official & Non-Aboriginal languages,Bengali,73125,47350,525,91220 -Non-Official & Non-Aboriginal languages,"Berber languages, n.i.e.",8985,2615,15,12510 -Non-Official & Non-Aboriginal languages,Bikol,1785,290,0,2075 -Non-Official & Non-Aboriginal languages,Bilen,805,615,15,1085 -Aboriginal languages,Blackfoot,2815,1110,85,5645 -Non-Official & Non-Aboriginal languages,Bosnian,12215,6045,155,18265 -Non-Official & Non-Aboriginal languages,Bulgarian,20020,11985,200,22425 -Non-Official & Non-Aboriginal languages,Burmese,3585,2245,75,4995 -Non-Official & Non-Aboriginal languages,Cantonese,565270,400220,58820,699125 -Aboriginal languages,Carrier,1025,250,15,2100 -Non-Official & Non-Aboriginal languages,Catalan,870,350,30,2035 -Aboriginal languages,Cayuga,45,10,10,125 -Non-Official & Non-Aboriginal languages,Cebuano,19890,7205,70,27040 -Non-Official & Non-Aboriginal languages,"Celtic languages, n.i.e.",525,80,10,3595 -Non-Official & Non-Aboriginal languages,Chaldean Neo-Aramaic,5545,3445,35,7115 -Aboriginal languages,Chilcotin,655,255,15,1150 -Non-Official & Non-Aboriginal languages,"Chinese languages, n.i.e.",615,280,0,590 -Non-Official & Non-Aboriginal languages,"Chinese, n.o.s.",38580,23940,2935,41685 -Aboriginal languages,Comox,85,0,0,185 -Aboriginal languages,"Cree, n.o.s.",64050,37950,7800,86115 -Non-Official & Non-Aboriginal languages,"Creole languages, n.i.e.",4985,2005,15,16635 -Non-Official & Non-Aboriginal languages,"Creole, n.o.s.",64110,24570,310,133045 -Non-Official & Non-Aboriginal languages,Croatian,48200,16775,220,69835 -Non-Official & Non-Aboriginal languages,"Cushitic languages, n.i.e.",365,180,0,480 -Non-Official & 
Non-Aboriginal languages,Czech,22295,6235,70,28725 -Aboriginal languages,Dakota,1210,255,20,1760 -Non-Official & Non-Aboriginal languages,Danish,12630,855,85,15750 -Aboriginal languages,Dene,10700,7710,770,13060 -Non-Official & Non-Aboriginal languages,Dinka,2120,1130,0,2475 -Aboriginal languages,Dogrib (Tlicho),1650,1020,165,2375 -Non-Official & Non-Aboriginal languages,"Dravidian languages, n.i.e.",490,190,0,790 -Non-Official & Non-Aboriginal languages,Dutch,99015,9565,1165,120870 -Non-Official & Non-Aboriginal languages,Edo,1670,410,0,3220 -Official languages,English,19460850,22162865,15265335,29748265 -Non-Official & Non-Aboriginal languages,Estonian,5445,975,55,6070 -Non-Official & Non-Aboriginal languages,Ewe,1760,405,10,3000 -Non-Official & Non-Aboriginal languages,Fijian,745,195,0,1665 -Non-Official & Non-Aboriginal languages,Finnish,15295,2790,105,17590 -Official languages,French,7166700,6943800,3825215,10242945 -Non-Official & Non-Aboriginal languages,Frisian,2100,185,40,2910 -Non-Official & Non-Aboriginal languages,"Fulah (Pular, Pulaar, Fulfulde)",2825,825,0,4725 -Non-Official & Non-Aboriginal languages,Ga,920,250,0,2250 -Non-Official & Non-Aboriginal languages,Ganda,1295,345,25,2495 -Non-Official & Non-Aboriginal languages,Georgian,1710,1040,25,2150 -Non-Official & Non-Aboriginal languages,German,384040,120335,10065,502735 -Non-Official & Non-Aboriginal languages,"Germanic languages, n.i.e.",525,1630,725,8705 -Aboriginal languages,Gitxsan (Gitksan),880,315,10,1305 -Non-Official & Non-Aboriginal languages,Greek,106525,44550,1020,150965 -Non-Official & Non-Aboriginal languages,Gujarati,108780,64150,885,149045 -Aboriginal languages,Gwich'in,255,50,10,360 -Aboriginal languages,Haida,80,10,0,465 -Aboriginal languages,Haisla,90,20,0,175 -Non-Official & Non-Aboriginal languages,Haitian Creole,3030,1280,25,6855 -Non-Official & Non-Aboriginal languages,Hakka,10910,4085,70,12445 -Aboriginal languages,Halkomelem,480,50,20,1060 -Non-Official & Non-Aboriginal 
languages,Harari,1320,735,0,1715 -Non-Official & Non-Aboriginal languages,Hebrew,19530,8560,825,75020 -Aboriginal languages,Heiltsuk,100,5,10,125 -Non-Official & Non-Aboriginal languages,Hiligaynon,6880,2210,25,7925 -Non-Official & Non-Aboriginal languages,Hindi,110645,55510,1405,433365 -Non-Official & Non-Aboriginal languages,Hmong-Mien languages,795,335,10,870 -Non-Official & Non-Aboriginal languages,Hungarian,61235,19480,440,71285 -Non-Official & Non-Aboriginal languages,Icelandic,1285,270,0,1780 -Non-Official & Non-Aboriginal languages,Igbo,4235,1000,10,8855 -Non-Official & Non-Aboriginal languages,Ilocano,26345,9125,110,34530 -Non-Official & Non-Aboriginal languages,"Indo-Iranian languages, n.i.e.",5185,2380,20,8870 -Aboriginal languages,Inuinnaqtun (Inuvialuktun),1020,165,30,1975 -Aboriginal languages,"Inuit languages, n.i.e.",310,90,15,470 -Aboriginal languages,Inuktitut,35210,29230,8795,40620 -Aboriginal languages,"Iroquoian languages, n.i.e.",35,5,0,115 -Non-Official & Non-Aboriginal languages,Italian,375635,115415,1705,574725 -Non-Official & Non-Aboriginal languages,"Italic (Romance) languages, n.i.e.",720,175,25,2680 -Non-Official & Non-Aboriginal languages,Japanese,43640,19785,3255,83095 -Non-Official & Non-Aboriginal languages,Kabyle,13150,5490,15,17120 -Non-Official & Non-Aboriginal languages,Kannada,3970,1630,10,8245 -Non-Official & Non-Aboriginal languages,Karenic languages,4705,3860,135,4895 -Non-Official & Non-Aboriginal languages,Kashmiri,565,135,0,905 -Aboriginal languages,Kaska (Nahani),180,20,10,365 -Non-Official & Non-Aboriginal languages,Khmer (Cambodian),20130,10885,475,27035 -Non-Official & Non-Aboriginal languages,Kinyarwanda (Rwanda),5250,1530,25,7860 -Non-Official & Non-Aboriginal languages,Konkani,3330,720,10,6790 -Non-Official & Non-Aboriginal languages,Korean,153425,109705,12150,172750 -Non-Official & Non-Aboriginal languages,Kurdish,11705,6580,185,15290 -Aboriginal languages,Kutenai,110,10,0,170 -Aboriginal languages,Kwakiutl 
(Kwak'wala),325,25,15,605 -Non-Official & Non-Aboriginal languages,Lao,12670,6175,150,17235 -Non-Official & Non-Aboriginal languages,Latvian,5450,1255,35,6500 -Aboriginal languages,Lillooet,315,25,15,790 -Non-Official & Non-Aboriginal languages,Lingala,3805,1045,10,17010 -Non-Official & Non-Aboriginal languages,Lithuanian,7075,2015,60,8185 -Non-Official & Non-Aboriginal languages,Macedonian,16770,6830,95,23075 -Non-Official & Non-Aboriginal languages,Malagasy,1430,430,0,2340 -Non-Official & Non-Aboriginal languages,Malay,12275,3625,140,22470 -Non-Official & Non-Aboriginal languages,Malayalam,28565,15440,95,37810 -Aboriginal languages,Malecite,300,55,10,760 -Non-Official & Non-Aboriginal languages,Maltese,5565,1125,25,7625 -Non-Official & Non-Aboriginal languages,Mandarin,592040,462890,60090,814450 -Non-Official & Non-Aboriginal languages,Marathi,8295,3780,30,15565 -Aboriginal languages,Mi'kmaq,6690,3565,915,9025 -Aboriginal languages,Michif,465,80,10,1210 -Non-Official & Non-Aboriginal languages,Min Dong,1230,345,30,1045 -Non-Official & Non-Aboriginal languages,"Min Nan (Chaochow, Teochow, Fukien, Taiwanese)",31800,13965,565,42840 -Aboriginal languages,Mohawk,985,255,30,2415 -Non-Official & Non-Aboriginal languages,Mongolian,1575,905,10,2095 -Aboriginal languages,Montagnais (Innu),10235,8585,2055,11445 -Aboriginal languages,Moose Cree,105,10,0,195 -Aboriginal languages,Naskapi,1205,1195,370,1465 -Non-Official & Non-Aboriginal languages,Nepali,18275,13375,195,21385 -Non-Official & Non-Aboriginal languages,"Niger-Congo languages, n.i.e.",19135,4010,30,40760 -Non-Official & Non-Aboriginal languages,"Nilo-Saharan languages, n.i.e.",3750,1520,0,4550 -Aboriginal languages,Nisga'a,400,75,10,1055 -Aboriginal languages,North Slavey (Hare),765,340,95,1005 -Aboriginal languages,Northern East Cree,315,110,35,550 -Aboriginal languages,Northern Tutchone,220,30,0,280 -Non-Official & Non-Aboriginal languages,Norwegian,4615,350,70,8120 -Aboriginal languages,Nuu-chah-nulth 
(Nootka),280,30,10,560 -Aboriginal languages,Oji-Cree,12855,7905,1080,15605 -Aboriginal languages,Ojibway,17885,6175,765,28580 -Aboriginal languages,Okanagan,275,80,20,820 -Aboriginal languages,Oneida,60,15,0,185 -Non-Official & Non-Aboriginal languages,Oriya (Odia),1055,475,0,1530 -Non-Official & Non-Aboriginal languages,Oromo,4960,3410,45,6245 -Non-Official & Non-Aboriginal languages,"Other languages, n.i.e.",3685,1110,80,9730 -Aboriginal languages,Ottawa (Odawa),150,75,0,205 -Non-Official & Non-Aboriginal languages,"Pampangan (Kapampangan, Pampango)",4045,1200,10,5425 -Non-Official & Non-Aboriginal languages,Pangasinan,1390,240,0,1800 -Non-Official & Non-Aboriginal languages,Pashto,16905,10590,50,23180 -Non-Official & Non-Aboriginal languages,Persian (Farsi),214200,143025,4580,252325 -Aboriginal languages,Plains Cree,3065,1345,95,5905 -Non-Official & Non-Aboriginal languages,Polish,181710,74780,2495,214965 -Non-Official & Non-Aboriginal languages,Portuguese,221535,98710,7485,295955 -Non-Official & Non-Aboriginal languages,Punjabi (Panjabi),501680,349140,27865,668240 -Non-Official & Non-Aboriginal languages,Quebec Sign Language,695,730,130,4665 -Non-Official & Non-Aboriginal languages,Romanian,96660,53325,745,115050 -Non-Official & Non-Aboriginal languages,Rundi (Kirundi),5850,2110,0,8590 -Non-Official & Non-Aboriginal languages,Russian,188255,116595,4855,269645 -Aboriginal languages,"Salish languages, n.i.e.",260,25,0,560 -Aboriginal languages,Sarsi (Sarcee),80,10,0,145 -Non-Official & Non-Aboriginal languages,Scottish Gaelic,1090,190,15,3980 -Aboriginal languages,Sekani,85,15,0,185 -Non-Official & Non-Aboriginal languages,"Semitic languages, n.i.e.",2150,1205,65,3220 -Non-Official & Non-Aboriginal languages,Serbian,57350,31750,530,73780 -Non-Official & Non-Aboriginal languages,Serbo-Croatian,9550,3890,30,11275 -Non-Official & Non-Aboriginal languages,Shona,3185,1035,0,5430 -Aboriginal languages,Shuswap (Secwepemctsin),445,50,35,1305 -Non-Official & 
Non-Aboriginal languages,"Sign languages, n.i.e",4125,6690,645,22280 -Non-Official & Non-Aboriginal languages,Sindhi,11860,4975,35,20260 -Non-Official & Non-Aboriginal languages,Sinhala (Sinhalese),16335,7790,40,27825 -Aboriginal languages,"Siouan languages, n.i.e.",55,20,0,140 -Aboriginal languages,"Slavey, n.o.s.",280,105,10,675 -Non-Official & Non-Aboriginal languages,"Slavic languages, n.i.e.",2420,670,10,2995 -Non-Official & Non-Aboriginal languages,Slovak,17580,5610,100,21470 -Non-Official & Non-Aboriginal languages,Slovene (Slovenian),9785,2055,15,11490 -Non-Official & Non-Aboriginal languages,Somali,36755,22895,220,49660 -Aboriginal languages,South Slavey,945,370,35,1365 -Aboriginal languages,Southern East Cree,45,15,0,40 -Aboriginal languages,Southern Tutchone,70,5,0,145 -Non-Official & Non-Aboriginal languages,Spanish,458850,263505,13030,995260 -Aboriginal languages,Squamish,40,5,10,285 -Aboriginal languages,Stoney,3025,1950,240,3675 -Aboriginal languages,Straits,80,25,15,365 -Non-Official & Non-Aboriginal languages,Swahili,13370,5370,80,38685 -Aboriginal languages,Swampy Cree,1440,330,10,2350 -Non-Official & Non-Aboriginal languages,Swedish,6840,1050,125,14140 -Non-Official & Non-Aboriginal languages,"Tagalog (Pilipino, Filipino)",431385,213790,3450,612735 -Aboriginal languages,Tahltan,95,5,0,265 -Non-Official & Non-Aboriginal languages,"Tai-Kadai languages, n.i.e",85,30,0,115 -Non-Official & Non-Aboriginal languages,Tamil,140720,96955,2085,189860 -Non-Official & Non-Aboriginal languages,Telugu,15660,8280,40,23165 -Non-Official & Non-Aboriginal languages,Thai,9255,3365,525,15395 -Aboriginal languages,Thompson (Ntlakapamux),335,20,0,450 -Non-Official & Non-Aboriginal languages,Tibetan,6160,4590,50,7050 -Non-Official & Non-Aboriginal languages,"Tibeto-Burman languages, n.i.e.",1405,655,15,2380 -Non-Official & Non-Aboriginal languages,Tigrigna,16645,10205,130,21340 -Aboriginal languages,Tlingit,95,0,10,260 -Aboriginal languages,Tsimshian,200,30,10,410 
-Non-Official & Non-Aboriginal languages,"Turkic languages, n.i.e.",1315,455,10,1875 -Non-Official & Non-Aboriginal languages,Turkish,32815,18955,690,50770 -Non-Official & Non-Aboriginal languages,Ukrainian,102485,28250,1210,132115 -Non-Official & Non-Aboriginal languages,"Uralic languages, n.i.e.",10,5,0,25 -Non-Official & Non-Aboriginal languages,Urdu,210815,128785,1495,322220 -Non-Official & Non-Aboriginal languages,Uyghur,1035,610,20,1390 -Non-Official & Non-Aboriginal languages,Uzbek,1720,995,15,2465 -Non-Official & Non-Aboriginal languages,Vietnamese,156430,104245,8075,198895 -Non-Official & Non-Aboriginal languages,Vlaams (Flemish),3895,355,35,4400 -Aboriginal languages,"Wakashan languages, n.i.e.",10,0,0,25 -Non-Official & Non-Aboriginal languages,Waray-Waray,1110,310,0,1395 -Non-Official & Non-Aboriginal languages,Welsh,1075,95,0,1695 -Non-Official & Non-Aboriginal languages,Wolof,3990,1385,10,8240 -Aboriginal languages,Woods Cree,1840,800,75,2665 -Non-Official & Non-Aboriginal languages,Wu (Shanghainese),12915,7650,105,16530 -Non-Official & Non-Aboriginal languages,Yiddish,13555,7085,895,20985 -Non-Official & Non-Aboriginal languages,Yoruba,9080,2615,15,22415 diff --git a/source/data/can_lang.tsv b/source/data/can_lang.tsv index d610ebe7..91659f1b 100644 --- a/source/data/can_lang.tsv +++ b/source/data/can_lang.tsv @@ -1,3 +1,4 @@ +category language mother_tongue most_at_home most_at_work lang_known Aboriginal languages Aboriginal languages, n.o.s. 590 235 30 665 Non-Official & Non-Aboriginal languages Afrikaans 10260 4785 85 23415 Non-Official & Non-Aboriginal languages Afro-Asiatic languages, n.i.e. 1150 445 10 2775 diff --git a/source/data/can_lang_meta-data.csv b/source/data/can_lang_meta-data.csv new file mode 100644 index 00000000..b4432f8f --- /dev/null +++ b/source/data/can_lang_meta-data.csv @@ -0,0 +1,218 @@ +Data source: https://ttimbers.github.io/canlang/ +Data originally published in: Statistics Canada Census of Population 2016. 
+Reproduced and distributed on an as-is basis with their permission. +category,language,mother_tongue,most_at_home,most_at_work,lang_known +Aboriginal languages,"Aboriginal languages, n.o.s.",590,235,30,665 +Non-Official & Non-Aboriginal languages,Afrikaans,10260,4785,85,23415 +Non-Official & Non-Aboriginal languages,"Afro-Asiatic languages, n.i.e.",1150,445,10,2775 +Non-Official & Non-Aboriginal languages,Akan (Twi),13460,5985,25,22150 +Non-Official & Non-Aboriginal languages,Albanian,26895,13135,345,31930 +Aboriginal languages,"Algonquian languages, n.i.e.",45,10,0,120 +Aboriginal languages,Algonquin,1260,370,40,2480 +Non-Official & Non-Aboriginal languages,American Sign Language,2685,3020,1145,21930 +Non-Official & Non-Aboriginal languages,Amharic,22465,12785,200,33670 +Non-Official & Non-Aboriginal languages,Arabic,419890,223535,5585,629055 +Non-Official & Non-Aboriginal languages,Armenian,33460,21510,450,41295 +Non-Official & Non-Aboriginal languages,Assyrian Neo-Aramaic,16070,10510,205,19740 +Aboriginal languages,"Athabaskan languages, n.i.e.",50,10,0,85 +Aboriginal languages,Atikamekw,6150,5465,1100,6645 +Non-Official & Non-Aboriginal languages,"Austro-Asiatic languages, n.i.e",170,80,0,190 +Non-Official & Non-Aboriginal languages,"Austronesian languages, n.i.e.",4195,1160,35,5585 +Non-Official & Non-Aboriginal languages,Azerbaijani,3255,1245,25,5455 +Aboriginal languages,Babine (Wetsuwet'en),110,20,10,210 +Non-Official & Non-Aboriginal languages,Bamanankan,1535,345,0,3190 +Aboriginal languages,Beaver,190,50,0,340 +Non-Official & Non-Aboriginal languages,Belarusan,810,225,0,2265 +Non-Official & Non-Aboriginal languages,Bengali,73125,47350,525,91220 +Non-Official & Non-Aboriginal languages,"Berber languages, n.i.e.",8985,2615,15,12510 +Non-Official & Non-Aboriginal languages,Bikol,1785,290,0,2075 +Non-Official & Non-Aboriginal languages,Bilen,805,615,15,1085 +Aboriginal languages,Blackfoot,2815,1110,85,5645 +Non-Official & Non-Aboriginal 
languages,Bosnian,12215,6045,155,18265 +Non-Official & Non-Aboriginal languages,Bulgarian,20020,11985,200,22425 +Non-Official & Non-Aboriginal languages,Burmese,3585,2245,75,4995 +Non-Official & Non-Aboriginal languages,Cantonese,565270,400220,58820,699125 +Aboriginal languages,Carrier,1025,250,15,2100 +Non-Official & Non-Aboriginal languages,Catalan,870,350,30,2035 +Aboriginal languages,Cayuga,45,10,10,125 +Non-Official & Non-Aboriginal languages,Cebuano,19890,7205,70,27040 +Non-Official & Non-Aboriginal languages,"Celtic languages, n.i.e.",525,80,10,3595 +Non-Official & Non-Aboriginal languages,Chaldean Neo-Aramaic,5545,3445,35,7115 +Aboriginal languages,Chilcotin,655,255,15,1150 +Non-Official & Non-Aboriginal languages,"Chinese languages, n.i.e.",615,280,0,590 +Non-Official & Non-Aboriginal languages,"Chinese, n.o.s.",38580,23940,2935,41685 +Aboriginal languages,Comox,85,0,0,185 +Aboriginal languages,"Cree, n.o.s.",64050,37950,7800,86115 +Non-Official & Non-Aboriginal languages,"Creole languages, n.i.e.",4985,2005,15,16635 +Non-Official & Non-Aboriginal languages,"Creole, n.o.s.",64110,24570,310,133045 +Non-Official & Non-Aboriginal languages,Croatian,48200,16775,220,69835 +Non-Official & Non-Aboriginal languages,"Cushitic languages, n.i.e.",365,180,0,480 +Non-Official & Non-Aboriginal languages,Czech,22295,6235,70,28725 +Aboriginal languages,Dakota,1210,255,20,1760 +Non-Official & Non-Aboriginal languages,Danish,12630,855,85,15750 +Aboriginal languages,Dene,10700,7710,770,13060 +Non-Official & Non-Aboriginal languages,Dinka,2120,1130,0,2475 +Aboriginal languages,Dogrib (Tlicho),1650,1020,165,2375 +Non-Official & Non-Aboriginal languages,"Dravidian languages, n.i.e.",490,190,0,790 +Non-Official & Non-Aboriginal languages,Dutch,99015,9565,1165,120870 +Non-Official & Non-Aboriginal languages,Edo,1670,410,0,3220 +Official languages,English,19460850,22162865,15265335,29748265 +Non-Official & Non-Aboriginal languages,Estonian,5445,975,55,6070 +Non-Official & 
Non-Aboriginal languages,Ewe,1760,405,10,3000 +Non-Official & Non-Aboriginal languages,Fijian,745,195,0,1665 +Non-Official & Non-Aboriginal languages,Finnish,15295,2790,105,17590 +Official languages,French,7166700,6943800,3825215,10242945 +Non-Official & Non-Aboriginal languages,Frisian,2100,185,40,2910 +Non-Official & Non-Aboriginal languages,"Fulah (Pular, Pulaar, Fulfulde)",2825,825,0,4725 +Non-Official & Non-Aboriginal languages,Ga,920,250,0,2250 +Non-Official & Non-Aboriginal languages,Ganda,1295,345,25,2495 +Non-Official & Non-Aboriginal languages,Georgian,1710,1040,25,2150 +Non-Official & Non-Aboriginal languages,German,384040,120335,10065,502735 +Non-Official & Non-Aboriginal languages,"Germanic languages, n.i.e.",525,1630,725,8705 +Aboriginal languages,Gitxsan (Gitksan),880,315,10,1305 +Non-Official & Non-Aboriginal languages,Greek,106525,44550,1020,150965 +Non-Official & Non-Aboriginal languages,Gujarati,108780,64150,885,149045 +Aboriginal languages,Gwich'in,255,50,10,360 +Aboriginal languages,Haida,80,10,0,465 +Aboriginal languages,Haisla,90,20,0,175 +Non-Official & Non-Aboriginal languages,Haitian Creole,3030,1280,25,6855 +Non-Official & Non-Aboriginal languages,Hakka,10910,4085,70,12445 +Aboriginal languages,Halkomelem,480,50,20,1060 +Non-Official & Non-Aboriginal languages,Harari,1320,735,0,1715 +Non-Official & Non-Aboriginal languages,Hebrew,19530,8560,825,75020 +Aboriginal languages,Heiltsuk,100,5,10,125 +Non-Official & Non-Aboriginal languages,Hiligaynon,6880,2210,25,7925 +Non-Official & Non-Aboriginal languages,Hindi,110645,55510,1405,433365 +Non-Official & Non-Aboriginal languages,Hmong-Mien languages,795,335,10,870 +Non-Official & Non-Aboriginal languages,Hungarian,61235,19480,440,71285 +Non-Official & Non-Aboriginal languages,Icelandic,1285,270,0,1780 +Non-Official & Non-Aboriginal languages,Igbo,4235,1000,10,8855 +Non-Official & Non-Aboriginal languages,Ilocano,26345,9125,110,34530 +Non-Official & Non-Aboriginal languages,"Indo-Iranian 
languages, n.i.e.",5185,2380,20,8870 +Aboriginal languages,Inuinnaqtun (Inuvialuktun),1020,165,30,1975 +Aboriginal languages,"Inuit languages, n.i.e.",310,90,15,470 +Aboriginal languages,Inuktitut,35210,29230,8795,40620 +Aboriginal languages,"Iroquoian languages, n.i.e.",35,5,0,115 +Non-Official & Non-Aboriginal languages,Italian,375635,115415,1705,574725 +Non-Official & Non-Aboriginal languages,"Italic (Romance) languages, n.i.e.",720,175,25,2680 +Non-Official & Non-Aboriginal languages,Japanese,43640,19785,3255,83095 +Non-Official & Non-Aboriginal languages,Kabyle,13150,5490,15,17120 +Non-Official & Non-Aboriginal languages,Kannada,3970,1630,10,8245 +Non-Official & Non-Aboriginal languages,Karenic languages,4705,3860,135,4895 +Non-Official & Non-Aboriginal languages,Kashmiri,565,135,0,905 +Aboriginal languages,Kaska (Nahani),180,20,10,365 +Non-Official & Non-Aboriginal languages,Khmer (Cambodian),20130,10885,475,27035 +Non-Official & Non-Aboriginal languages,Kinyarwanda (Rwanda),5250,1530,25,7860 +Non-Official & Non-Aboriginal languages,Konkani,3330,720,10,6790 +Non-Official & Non-Aboriginal languages,Korean,153425,109705,12150,172750 +Non-Official & Non-Aboriginal languages,Kurdish,11705,6580,185,15290 +Aboriginal languages,Kutenai,110,10,0,170 +Aboriginal languages,Kwakiutl (Kwak'wala),325,25,15,605 +Non-Official & Non-Aboriginal languages,Lao,12670,6175,150,17235 +Non-Official & Non-Aboriginal languages,Latvian,5450,1255,35,6500 +Aboriginal languages,Lillooet,315,25,15,790 +Non-Official & Non-Aboriginal languages,Lingala,3805,1045,10,17010 +Non-Official & Non-Aboriginal languages,Lithuanian,7075,2015,60,8185 +Non-Official & Non-Aboriginal languages,Macedonian,16770,6830,95,23075 +Non-Official & Non-Aboriginal languages,Malagasy,1430,430,0,2340 +Non-Official & Non-Aboriginal languages,Malay,12275,3625,140,22470 +Non-Official & Non-Aboriginal languages,Malayalam,28565,15440,95,37810 +Aboriginal languages,Malecite,300,55,10,760 +Non-Official & Non-Aboriginal 
languages,Maltese,5565,1125,25,7625 +Non-Official & Non-Aboriginal languages,Mandarin,592040,462890,60090,814450 +Non-Official & Non-Aboriginal languages,Marathi,8295,3780,30,15565 +Aboriginal languages,Mi'kmaq,6690,3565,915,9025 +Aboriginal languages,Michif,465,80,10,1210 +Non-Official & Non-Aboriginal languages,Min Dong,1230,345,30,1045 +Non-Official & Non-Aboriginal languages,"Min Nan (Chaochow, Teochow, Fukien, Taiwanese)",31800,13965,565,42840 +Aboriginal languages,Mohawk,985,255,30,2415 +Non-Official & Non-Aboriginal languages,Mongolian,1575,905,10,2095 +Aboriginal languages,Montagnais (Innu),10235,8585,2055,11445 +Aboriginal languages,Moose Cree,105,10,0,195 +Aboriginal languages,Naskapi,1205,1195,370,1465 +Non-Official & Non-Aboriginal languages,Nepali,18275,13375,195,21385 +Non-Official & Non-Aboriginal languages,"Niger-Congo languages, n.i.e.",19135,4010,30,40760 +Non-Official & Non-Aboriginal languages,"Nilo-Saharan languages, n.i.e.",3750,1520,0,4550 +Aboriginal languages,Nisga'a,400,75,10,1055 +Aboriginal languages,North Slavey (Hare),765,340,95,1005 +Aboriginal languages,Northern East Cree,315,110,35,550 +Aboriginal languages,Northern Tutchone,220,30,0,280 +Non-Official & Non-Aboriginal languages,Norwegian,4615,350,70,8120 +Aboriginal languages,Nuu-chah-nulth (Nootka),280,30,10,560 +Aboriginal languages,Oji-Cree,12855,7905,1080,15605 +Aboriginal languages,Ojibway,17885,6175,765,28580 +Aboriginal languages,Okanagan,275,80,20,820 +Aboriginal languages,Oneida,60,15,0,185 +Non-Official & Non-Aboriginal languages,Oriya (Odia),1055,475,0,1530 +Non-Official & Non-Aboriginal languages,Oromo,4960,3410,45,6245 +Non-Official & Non-Aboriginal languages,"Other languages, n.i.e.",3685,1110,80,9730 +Aboriginal languages,Ottawa (Odawa),150,75,0,205 +Non-Official & Non-Aboriginal languages,"Pampangan (Kapampangan, Pampango)",4045,1200,10,5425 +Non-Official & Non-Aboriginal languages,Pangasinan,1390,240,0,1800 +Non-Official & Non-Aboriginal 
languages,Pashto,16905,10590,50,23180 +Non-Official & Non-Aboriginal languages,Persian (Farsi),214200,143025,4580,252325 +Aboriginal languages,Plains Cree,3065,1345,95,5905 +Non-Official & Non-Aboriginal languages,Polish,181710,74780,2495,214965 +Non-Official & Non-Aboriginal languages,Portuguese,221535,98710,7485,295955 +Non-Official & Non-Aboriginal languages,Punjabi (Panjabi),501680,349140,27865,668240 +Non-Official & Non-Aboriginal languages,Quebec Sign Language,695,730,130,4665 +Non-Official & Non-Aboriginal languages,Romanian,96660,53325,745,115050 +Non-Official & Non-Aboriginal languages,Rundi (Kirundi),5850,2110,0,8590 +Non-Official & Non-Aboriginal languages,Russian,188255,116595,4855,269645 +Aboriginal languages,"Salish languages, n.i.e.",260,25,0,560 +Aboriginal languages,Sarsi (Sarcee),80,10,0,145 +Non-Official & Non-Aboriginal languages,Scottish Gaelic,1090,190,15,3980 +Aboriginal languages,Sekani,85,15,0,185 +Non-Official & Non-Aboriginal languages,"Semitic languages, n.i.e.",2150,1205,65,3220 +Non-Official & Non-Aboriginal languages,Serbian,57350,31750,530,73780 +Non-Official & Non-Aboriginal languages,Serbo-Croatian,9550,3890,30,11275 +Non-Official & Non-Aboriginal languages,Shona,3185,1035,0,5430 +Aboriginal languages,Shuswap (Secwepemctsin),445,50,35,1305 +Non-Official & Non-Aboriginal languages,"Sign languages, n.i.e",4125,6690,645,22280 +Non-Official & Non-Aboriginal languages,Sindhi,11860,4975,35,20260 +Non-Official & Non-Aboriginal languages,Sinhala (Sinhalese),16335,7790,40,27825 +Aboriginal languages,"Siouan languages, n.i.e.",55,20,0,140 +Aboriginal languages,"Slavey, n.o.s.",280,105,10,675 +Non-Official & Non-Aboriginal languages,"Slavic languages, n.i.e.",2420,670,10,2995 +Non-Official & Non-Aboriginal languages,Slovak,17580,5610,100,21470 +Non-Official & Non-Aboriginal languages,Slovene (Slovenian),9785,2055,15,11490 +Non-Official & Non-Aboriginal languages,Somali,36755,22895,220,49660 +Aboriginal languages,South Slavey,945,370,35,1365 
+Aboriginal languages,Southern East Cree,45,15,0,40 +Aboriginal languages,Southern Tutchone,70,5,0,145 +Non-Official & Non-Aboriginal languages,Spanish,458850,263505,13030,995260 +Aboriginal languages,Squamish,40,5,10,285 +Aboriginal languages,Stoney,3025,1950,240,3675 +Aboriginal languages,Straits,80,25,15,365 +Non-Official & Non-Aboriginal languages,Swahili,13370,5370,80,38685 +Aboriginal languages,Swampy Cree,1440,330,10,2350 +Non-Official & Non-Aboriginal languages,Swedish,6840,1050,125,14140 +Non-Official & Non-Aboriginal languages,"Tagalog (Pilipino, Filipino)",431385,213790,3450,612735 +Aboriginal languages,Tahltan,95,5,0,265 +Non-Official & Non-Aboriginal languages,"Tai-Kadai languages, n.i.e",85,30,0,115 +Non-Official & Non-Aboriginal languages,Tamil,140720,96955,2085,189860 +Non-Official & Non-Aboriginal languages,Telugu,15660,8280,40,23165 +Non-Official & Non-Aboriginal languages,Thai,9255,3365,525,15395 +Aboriginal languages,Thompson (Ntlakapamux),335,20,0,450 +Non-Official & Non-Aboriginal languages,Tibetan,6160,4590,50,7050 +Non-Official & Non-Aboriginal languages,"Tibeto-Burman languages, n.i.e.",1405,655,15,2380 +Non-Official & Non-Aboriginal languages,Tigrigna,16645,10205,130,21340 +Aboriginal languages,Tlingit,95,0,10,260 +Aboriginal languages,Tsimshian,200,30,10,410 +Non-Official & Non-Aboriginal languages,"Turkic languages, n.i.e.",1315,455,10,1875 +Non-Official & Non-Aboriginal languages,Turkish,32815,18955,690,50770 +Non-Official & Non-Aboriginal languages,Ukrainian,102485,28250,1210,132115 +Non-Official & Non-Aboriginal languages,"Uralic languages, n.i.e.",10,5,0,25 +Non-Official & Non-Aboriginal languages,Urdu,210815,128785,1495,322220 +Non-Official & Non-Aboriginal languages,Uyghur,1035,610,20,1390 +Non-Official & Non-Aboriginal languages,Uzbek,1720,995,15,2465 +Non-Official & Non-Aboriginal languages,Vietnamese,156430,104245,8075,198895 +Non-Official & Non-Aboriginal languages,Vlaams (Flemish),3895,355,35,4400 +Aboriginal 
languages,"Wakashan languages, n.i.e.",10,0,0,25 +Non-Official & Non-Aboriginal languages,Waray-Waray,1110,310,0,1395 +Non-Official & Non-Aboriginal languages,Welsh,1075,95,0,1695 +Non-Official & Non-Aboriginal languages,Wolof,3990,1385,10,8240 +Aboriginal languages,Woods Cree,1840,800,75,2665 +Non-Official & Non-Aboriginal languages,Wu (Shanghainese),12915,7650,105,16530 +Non-Official & Non-Aboriginal languages,Yiddish,13555,7085,895,20985 +Non-Official & Non-Aboriginal languages,Yoruba,9080,2615,15,22415 diff --git a/source/data/can_lang_no_cols.tsv b/source/data/can_lang_no_cols.tsv new file mode 100644 index 00000000..d610ebe7 --- /dev/null +++ b/source/data/can_lang_no_cols.tsv @@ -0,0 +1,214 @@ +Aboriginal languages Aboriginal languages, n.o.s. 590 235 30 665 +Non-Official & Non-Aboriginal languages Afrikaans 10260 4785 85 23415 +Non-Official & Non-Aboriginal languages Afro-Asiatic languages, n.i.e. 1150 445 10 2775 +Non-Official & Non-Aboriginal languages Akan (Twi) 13460 5985 25 22150 +Non-Official & Non-Aboriginal languages Albanian 26895 13135 345 31930 +Aboriginal languages Algonquian languages, n.i.e. 45 10 0 120 +Aboriginal languages Algonquin 1260 370 40 2480 +Non-Official & Non-Aboriginal languages American Sign Language 2685 3020 1145 21930 +Non-Official & Non-Aboriginal languages Amharic 22465 12785 200 33670 +Non-Official & Non-Aboriginal languages Arabic 419890 223535 5585 629055 +Non-Official & Non-Aboriginal languages Armenian 33460 21510 450 41295 +Non-Official & Non-Aboriginal languages Assyrian Neo-Aramaic 16070 10510 205 19740 +Aboriginal languages Athabaskan languages, n.i.e. 50 10 0 85 +Aboriginal languages Atikamekw 6150 5465 1100 6645 +Non-Official & Non-Aboriginal languages Austro-Asiatic languages, n.i.e 170 80 0 190 +Non-Official & Non-Aboriginal languages Austronesian languages, n.i.e. 
4195 1160 35 5585 +Non-Official & Non-Aboriginal languages Azerbaijani 3255 1245 25 5455 +Aboriginal languages Babine (Wetsuwet'en) 110 20 10 210 +Non-Official & Non-Aboriginal languages Bamanankan 1535 345 0 3190 +Aboriginal languages Beaver 190 50 0 340 +Non-Official & Non-Aboriginal languages Belarusan 810 225 0 2265 +Non-Official & Non-Aboriginal languages Bengali 73125 47350 525 91220 +Non-Official & Non-Aboriginal languages Berber languages, n.i.e. 8985 2615 15 12510 +Non-Official & Non-Aboriginal languages Bikol 1785 290 0 2075 +Non-Official & Non-Aboriginal languages Bilen 805 615 15 1085 +Aboriginal languages Blackfoot 2815 1110 85 5645 +Non-Official & Non-Aboriginal languages Bosnian 12215 6045 155 18265 +Non-Official & Non-Aboriginal languages Bulgarian 20020 11985 200 22425 +Non-Official & Non-Aboriginal languages Burmese 3585 2245 75 4995 +Non-Official & Non-Aboriginal languages Cantonese 565270 400220 58820 699125 +Aboriginal languages Carrier 1025 250 15 2100 +Non-Official & Non-Aboriginal languages Catalan 870 350 30 2035 +Aboriginal languages Cayuga 45 10 10 125 +Non-Official & Non-Aboriginal languages Cebuano 19890 7205 70 27040 +Non-Official & Non-Aboriginal languages Celtic languages, n.i.e. 525 80 10 3595 +Non-Official & Non-Aboriginal languages Chaldean Neo-Aramaic 5545 3445 35 7115 +Aboriginal languages Chilcotin 655 255 15 1150 +Non-Official & Non-Aboriginal languages Chinese languages, n.i.e. 615 280 0 590 +Non-Official & Non-Aboriginal languages Chinese, n.o.s. 38580 23940 2935 41685 +Aboriginal languages Comox 85 0 0 185 +Aboriginal languages Cree, n.o.s. 64050 37950 7800 86115 +Non-Official & Non-Aboriginal languages Creole languages, n.i.e. 4985 2005 15 16635 +Non-Official & Non-Aboriginal languages Creole, n.o.s. 64110 24570 310 133045 +Non-Official & Non-Aboriginal languages Croatian 48200 16775 220 69835 +Non-Official & Non-Aboriginal languages Cushitic languages, n.i.e. 
365 180 0 480 +Non-Official & Non-Aboriginal languages Czech 22295 6235 70 28725 +Aboriginal languages Dakota 1210 255 20 1760 +Non-Official & Non-Aboriginal languages Danish 12630 855 85 15750 +Aboriginal languages Dene 10700 7710 770 13060 +Non-Official & Non-Aboriginal languages Dinka 2120 1130 0 2475 +Aboriginal languages Dogrib (Tlicho) 1650 1020 165 2375 +Non-Official & Non-Aboriginal languages Dravidian languages, n.i.e. 490 190 0 790 +Non-Official & Non-Aboriginal languages Dutch 99015 9565 1165 120870 +Non-Official & Non-Aboriginal languages Edo 1670 410 0 3220 +Official languages English 19460850 22162865 15265335 29748265 +Non-Official & Non-Aboriginal languages Estonian 5445 975 55 6070 +Non-Official & Non-Aboriginal languages Ewe 1760 405 10 3000 +Non-Official & Non-Aboriginal languages Fijian 745 195 0 1665 +Non-Official & Non-Aboriginal languages Finnish 15295 2790 105 17590 +Official languages French 7166700 6943800 3825215 10242945 +Non-Official & Non-Aboriginal languages Frisian 2100 185 40 2910 +Non-Official & Non-Aboriginal languages Fulah (Pular, Pulaar, Fulfulde) 2825 825 0 4725 +Non-Official & Non-Aboriginal languages Ga 920 250 0 2250 +Non-Official & Non-Aboriginal languages Ganda 1295 345 25 2495 +Non-Official & Non-Aboriginal languages Georgian 1710 1040 25 2150 +Non-Official & Non-Aboriginal languages German 384040 120335 10065 502735 +Non-Official & Non-Aboriginal languages Germanic languages, n.i.e. 
525 1630 725 8705 +Aboriginal languages Gitxsan (Gitksan) 880 315 10 1305 +Non-Official & Non-Aboriginal languages Greek 106525 44550 1020 150965 +Non-Official & Non-Aboriginal languages Gujarati 108780 64150 885 149045 +Aboriginal languages Gwich'in 255 50 10 360 +Aboriginal languages Haida 80 10 0 465 +Aboriginal languages Haisla 90 20 0 175 +Non-Official & Non-Aboriginal languages Haitian Creole 3030 1280 25 6855 +Non-Official & Non-Aboriginal languages Hakka 10910 4085 70 12445 +Aboriginal languages Halkomelem 480 50 20 1060 +Non-Official & Non-Aboriginal languages Harari 1320 735 0 1715 +Non-Official & Non-Aboriginal languages Hebrew 19530 8560 825 75020 +Aboriginal languages Heiltsuk 100 5 10 125 +Non-Official & Non-Aboriginal languages Hiligaynon 6880 2210 25 7925 +Non-Official & Non-Aboriginal languages Hindi 110645 55510 1405 433365 +Non-Official & Non-Aboriginal languages Hmong-Mien languages 795 335 10 870 +Non-Official & Non-Aboriginal languages Hungarian 61235 19480 440 71285 +Non-Official & Non-Aboriginal languages Icelandic 1285 270 0 1780 +Non-Official & Non-Aboriginal languages Igbo 4235 1000 10 8855 +Non-Official & Non-Aboriginal languages Ilocano 26345 9125 110 34530 +Non-Official & Non-Aboriginal languages Indo-Iranian languages, n.i.e. 5185 2380 20 8870 +Aboriginal languages Inuinnaqtun (Inuvialuktun) 1020 165 30 1975 +Aboriginal languages Inuit languages, n.i.e. 310 90 15 470 +Aboriginal languages Inuktitut 35210 29230 8795 40620 +Aboriginal languages Iroquoian languages, n.i.e. 35 5 0 115 +Non-Official & Non-Aboriginal languages Italian 375635 115415 1705 574725 +Non-Official & Non-Aboriginal languages Italic (Romance) languages, n.i.e. 
720 175 25 2680 +Non-Official & Non-Aboriginal languages Japanese 43640 19785 3255 83095 +Non-Official & Non-Aboriginal languages Kabyle 13150 5490 15 17120 +Non-Official & Non-Aboriginal languages Kannada 3970 1630 10 8245 +Non-Official & Non-Aboriginal languages Karenic languages 4705 3860 135 4895 +Non-Official & Non-Aboriginal languages Kashmiri 565 135 0 905 +Aboriginal languages Kaska (Nahani) 180 20 10 365 +Non-Official & Non-Aboriginal languages Khmer (Cambodian) 20130 10885 475 27035 +Non-Official & Non-Aboriginal languages Kinyarwanda (Rwanda) 5250 1530 25 7860 +Non-Official & Non-Aboriginal languages Konkani 3330 720 10 6790 +Non-Official & Non-Aboriginal languages Korean 153425 109705 12150 172750 +Non-Official & Non-Aboriginal languages Kurdish 11705 6580 185 15290 +Aboriginal languages Kutenai 110 10 0 170 +Aboriginal languages Kwakiutl (Kwak'wala) 325 25 15 605 +Non-Official & Non-Aboriginal languages Lao 12670 6175 150 17235 +Non-Official & Non-Aboriginal languages Latvian 5450 1255 35 6500 +Aboriginal languages Lillooet 315 25 15 790 +Non-Official & Non-Aboriginal languages Lingala 3805 1045 10 17010 +Non-Official & Non-Aboriginal languages Lithuanian 7075 2015 60 8185 +Non-Official & Non-Aboriginal languages Macedonian 16770 6830 95 23075 +Non-Official & Non-Aboriginal languages Malagasy 1430 430 0 2340 +Non-Official & Non-Aboriginal languages Malay 12275 3625 140 22470 +Non-Official & Non-Aboriginal languages Malayalam 28565 15440 95 37810 +Aboriginal languages Malecite 300 55 10 760 +Non-Official & Non-Aboriginal languages Maltese 5565 1125 25 7625 +Non-Official & Non-Aboriginal languages Mandarin 592040 462890 60090 814450 +Non-Official & Non-Aboriginal languages Marathi 8295 3780 30 15565 +Aboriginal languages Mi'kmaq 6690 3565 915 9025 +Aboriginal languages Michif 465 80 10 1210 +Non-Official & Non-Aboriginal languages Min Dong 1230 345 30 1045 +Non-Official & Non-Aboriginal languages Min Nan (Chaochow, Teochow, Fukien, Taiwanese) 31800 13965 
565 42840 +Aboriginal languages Mohawk 985 255 30 2415 +Non-Official & Non-Aboriginal languages Mongolian 1575 905 10 2095 +Aboriginal languages Montagnais (Innu) 10235 8585 2055 11445 +Aboriginal languages Moose Cree 105 10 0 195 +Aboriginal languages Naskapi 1205 1195 370 1465 +Non-Official & Non-Aboriginal languages Nepali 18275 13375 195 21385 +Non-Official & Non-Aboriginal languages Niger-Congo languages, n.i.e. 19135 4010 30 40760 +Non-Official & Non-Aboriginal languages Nilo-Saharan languages, n.i.e. 3750 1520 0 4550 +Aboriginal languages Nisga'a 400 75 10 1055 +Aboriginal languages North Slavey (Hare) 765 340 95 1005 +Aboriginal languages Northern East Cree 315 110 35 550 +Aboriginal languages Northern Tutchone 220 30 0 280 +Non-Official & Non-Aboriginal languages Norwegian 4615 350 70 8120 +Aboriginal languages Nuu-chah-nulth (Nootka) 280 30 10 560 +Aboriginal languages Oji-Cree 12855 7905 1080 15605 +Aboriginal languages Ojibway 17885 6175 765 28580 +Aboriginal languages Okanagan 275 80 20 820 +Aboriginal languages Oneida 60 15 0 185 +Non-Official & Non-Aboriginal languages Oriya (Odia) 1055 475 0 1530 +Non-Official & Non-Aboriginal languages Oromo 4960 3410 45 6245 +Non-Official & Non-Aboriginal languages Other languages, n.i.e. 
3685 1110 80 9730 +Aboriginal languages Ottawa (Odawa) 150 75 0 205 +Non-Official & Non-Aboriginal languages Pampangan (Kapampangan, Pampango) 4045 1200 10 5425 +Non-Official & Non-Aboriginal languages Pangasinan 1390 240 0 1800 +Non-Official & Non-Aboriginal languages Pashto 16905 10590 50 23180 +Non-Official & Non-Aboriginal languages Persian (Farsi) 214200 143025 4580 252325 +Aboriginal languages Plains Cree 3065 1345 95 5905 +Non-Official & Non-Aboriginal languages Polish 181710 74780 2495 214965 +Non-Official & Non-Aboriginal languages Portuguese 221535 98710 7485 295955 +Non-Official & Non-Aboriginal languages Punjabi (Panjabi) 501680 349140 27865 668240 +Non-Official & Non-Aboriginal languages Quebec Sign Language 695 730 130 4665 +Non-Official & Non-Aboriginal languages Romanian 96660 53325 745 115050 +Non-Official & Non-Aboriginal languages Rundi (Kirundi) 5850 2110 0 8590 +Non-Official & Non-Aboriginal languages Russian 188255 116595 4855 269645 +Aboriginal languages Salish languages, n.i.e. 260 25 0 560 +Aboriginal languages Sarsi (Sarcee) 80 10 0 145 +Non-Official & Non-Aboriginal languages Scottish Gaelic 1090 190 15 3980 +Aboriginal languages Sekani 85 15 0 185 +Non-Official & Non-Aboriginal languages Semitic languages, n.i.e. 2150 1205 65 3220 +Non-Official & Non-Aboriginal languages Serbian 57350 31750 530 73780 +Non-Official & Non-Aboriginal languages Serbo-Croatian 9550 3890 30 11275 +Non-Official & Non-Aboriginal languages Shona 3185 1035 0 5430 +Aboriginal languages Shuswap (Secwepemctsin) 445 50 35 1305 +Non-Official & Non-Aboriginal languages Sign languages, n.i.e 4125 6690 645 22280 +Non-Official & Non-Aboriginal languages Sindhi 11860 4975 35 20260 +Non-Official & Non-Aboriginal languages Sinhala (Sinhalese) 16335 7790 40 27825 +Aboriginal languages Siouan languages, n.i.e. 55 20 0 140 +Aboriginal languages Slavey, n.o.s. 280 105 10 675 +Non-Official & Non-Aboriginal languages Slavic languages, n.i.e. 
2420 670 10 2995 +Non-Official & Non-Aboriginal languages Slovak 17580 5610 100 21470 +Non-Official & Non-Aboriginal languages Slovene (Slovenian) 9785 2055 15 11490 +Non-Official & Non-Aboriginal languages Somali 36755 22895 220 49660 +Aboriginal languages South Slavey 945 370 35 1365 +Aboriginal languages Southern East Cree 45 15 0 40 +Aboriginal languages Southern Tutchone 70 5 0 145 +Non-Official & Non-Aboriginal languages Spanish 458850 263505 13030 995260 +Aboriginal languages Squamish 40 5 10 285 +Aboriginal languages Stoney 3025 1950 240 3675 +Aboriginal languages Straits 80 25 15 365 +Non-Official & Non-Aboriginal languages Swahili 13370 5370 80 38685 +Aboriginal languages Swampy Cree 1440 330 10 2350 +Non-Official & Non-Aboriginal languages Swedish 6840 1050 125 14140 +Non-Official & Non-Aboriginal languages Tagalog (Pilipino, Filipino) 431385 213790 3450 612735 +Aboriginal languages Tahltan 95 5 0 265 +Non-Official & Non-Aboriginal languages Tai-Kadai languages, n.i.e 85 30 0 115 +Non-Official & Non-Aboriginal languages Tamil 140720 96955 2085 189860 +Non-Official & Non-Aboriginal languages Telugu 15660 8280 40 23165 +Non-Official & Non-Aboriginal languages Thai 9255 3365 525 15395 +Aboriginal languages Thompson (Ntlakapamux) 335 20 0 450 +Non-Official & Non-Aboriginal languages Tibetan 6160 4590 50 7050 +Non-Official & Non-Aboriginal languages Tibeto-Burman languages, n.i.e. 1405 655 15 2380 +Non-Official & Non-Aboriginal languages Tigrigna 16645 10205 130 21340 +Aboriginal languages Tlingit 95 0 10 260 +Aboriginal languages Tsimshian 200 30 10 410 +Non-Official & Non-Aboriginal languages Turkic languages, n.i.e. 1315 455 10 1875 +Non-Official & Non-Aboriginal languages Turkish 32815 18955 690 50770 +Non-Official & Non-Aboriginal languages Ukrainian 102485 28250 1210 132115 +Non-Official & Non-Aboriginal languages Uralic languages, n.i.e. 
10 5 0 25 +Non-Official & Non-Aboriginal languages Urdu 210815 128785 1495 322220 +Non-Official & Non-Aboriginal languages Uyghur 1035 610 20 1390 +Non-Official & Non-Aboriginal languages Uzbek 1720 995 15 2465 +Non-Official & Non-Aboriginal languages Vietnamese 156430 104245 8075 198895 +Non-Official & Non-Aboriginal languages Vlaams (Flemish) 3895 355 35 4400 +Aboriginal languages Wakashan languages, n.i.e. 10 0 0 25 +Non-Official & Non-Aboriginal languages Waray-Waray 1110 310 0 1395 +Non-Official & Non-Aboriginal languages Welsh 1075 95 0 1695 +Non-Official & Non-Aboriginal languages Wolof 3990 1385 10 8240 +Aboriginal languages Woods Cree 1840 800 75 2665 +Non-Official & Non-Aboriginal languages Wu (Shanghainese) 12915 7650 105 16530 +Non-Official & Non-Aboriginal languages Yiddish 13555 7085 895 20985 +Non-Official & Non-Aboriginal languages Yoruba 9080 2615 15 22415 diff --git a/source/data/no_official_languages.csv b/source/data/no_official_languages.csv index 019589ce..5278e287 100644 --- a/source/data/no_official_languages.csv +++ b/source/data/no_official_languages.csv @@ -1,213 +1,213 @@ -,category,language,mother_tongue,most_at_home,most_at_work,lang_known -0,Aboriginal languages,"Aboriginal languages, n.o.s.",590,235,30,665 -1,Non-Official & Non-Aboriginal languages,Afrikaans,10260,4785,85,23415 -2,Non-Official & Non-Aboriginal languages,"Afro-Asiatic languages, n.i.e.",1150,445,10,2775 -3,Non-Official & Non-Aboriginal languages,Akan (Twi),13460,5985,25,22150 -4,Non-Official & Non-Aboriginal languages,Albanian,26895,13135,345,31930 -5,Aboriginal languages,"Algonquian languages, n.i.e.",45,10,0,120 -6,Aboriginal languages,Algonquin,1260,370,40,2480 -7,Non-Official & Non-Aboriginal languages,American Sign Language,2685,3020,1145,21930 -8,Non-Official & Non-Aboriginal languages,Amharic,22465,12785,200,33670 -9,Non-Official & Non-Aboriginal languages,Arabic,419890,223535,5585,629055 -10,Non-Official & Non-Aboriginal 
languages,Armenian,33460,21510,450,41295 -11,Non-Official & Non-Aboriginal languages,Assyrian Neo-Aramaic,16070,10510,205,19740 -12,Aboriginal languages,"Athabaskan languages, n.i.e.",50,10,0,85 -13,Aboriginal languages,Atikamekw,6150,5465,1100,6645 -14,Non-Official & Non-Aboriginal languages,"Austro-Asiatic languages, n.i.e",170,80,0,190 -15,Non-Official & Non-Aboriginal languages,"Austronesian languages, n.i.e.",4195,1160,35,5585 -16,Non-Official & Non-Aboriginal languages,Azerbaijani,3255,1245,25,5455 -17,Aboriginal languages,Babine (Wetsuwet'en),110,20,10,210 -18,Non-Official & Non-Aboriginal languages,Bamanankan,1535,345,0,3190 -19,Aboriginal languages,Beaver,190,50,0,340 -20,Non-Official & Non-Aboriginal languages,Belarusan,810,225,0,2265 -21,Non-Official & Non-Aboriginal languages,Bengali,73125,47350,525,91220 -22,Non-Official & Non-Aboriginal languages,"Berber languages, n.i.e.",8985,2615,15,12510 -23,Non-Official & Non-Aboriginal languages,Bikol,1785,290,0,2075 -24,Non-Official & Non-Aboriginal languages,Bilen,805,615,15,1085 -25,Aboriginal languages,Blackfoot,2815,1110,85,5645 -26,Non-Official & Non-Aboriginal languages,Bosnian,12215,6045,155,18265 -27,Non-Official & Non-Aboriginal languages,Bulgarian,20020,11985,200,22425 -28,Non-Official & Non-Aboriginal languages,Burmese,3585,2245,75,4995 -29,Non-Official & Non-Aboriginal languages,Cantonese,565270,400220,58820,699125 -30,Aboriginal languages,Carrier,1025,250,15,2100 -31,Non-Official & Non-Aboriginal languages,Catalan,870,350,30,2035 -32,Aboriginal languages,Cayuga,45,10,10,125 -33,Non-Official & Non-Aboriginal languages,Cebuano,19890,7205,70,27040 -34,Non-Official & Non-Aboriginal languages,"Celtic languages, n.i.e.",525,80,10,3595 -35,Non-Official & Non-Aboriginal languages,Chaldean Neo-Aramaic,5545,3445,35,7115 -36,Aboriginal languages,Chilcotin,655,255,15,1150 -37,Non-Official & Non-Aboriginal languages,"Chinese languages, n.i.e.",615,280,0,590 -38,Non-Official & Non-Aboriginal languages,"Chinese, 
n.o.s.",38580,23940,2935,41685 -39,Aboriginal languages,Comox,85,0,0,185 -40,Aboriginal languages,"Cree, n.o.s.",64050,37950,7800,86115 -41,Non-Official & Non-Aboriginal languages,"Creole languages, n.i.e.",4985,2005,15,16635 -42,Non-Official & Non-Aboriginal languages,"Creole, n.o.s.",64110,24570,310,133045 -43,Non-Official & Non-Aboriginal languages,Croatian,48200,16775,220,69835 -44,Non-Official & Non-Aboriginal languages,"Cushitic languages, n.i.e.",365,180,0,480 -45,Non-Official & Non-Aboriginal languages,Czech,22295,6235,70,28725 -46,Aboriginal languages,Dakota,1210,255,20,1760 -47,Non-Official & Non-Aboriginal languages,Danish,12630,855,85,15750 -48,Aboriginal languages,Dene,10700,7710,770,13060 -49,Non-Official & Non-Aboriginal languages,Dinka,2120,1130,0,2475 -50,Aboriginal languages,Dogrib (Tlicho),1650,1020,165,2375 -51,Non-Official & Non-Aboriginal languages,"Dravidian languages, n.i.e.",490,190,0,790 -52,Non-Official & Non-Aboriginal languages,Dutch,99015,9565,1165,120870 -53,Non-Official & Non-Aboriginal languages,Edo,1670,410,0,3220 -55,Non-Official & Non-Aboriginal languages,Estonian,5445,975,55,6070 -56,Non-Official & Non-Aboriginal languages,Ewe,1760,405,10,3000 -57,Non-Official & Non-Aboriginal languages,Fijian,745,195,0,1665 -58,Non-Official & Non-Aboriginal languages,Finnish,15295,2790,105,17590 -60,Non-Official & Non-Aboriginal languages,Frisian,2100,185,40,2910 -61,Non-Official & Non-Aboriginal languages,"Fulah (Pular, Pulaar, Fulfulde)",2825,825,0,4725 -62,Non-Official & Non-Aboriginal languages,Ga,920,250,0,2250 -63,Non-Official & Non-Aboriginal languages,Ganda,1295,345,25,2495 -64,Non-Official & Non-Aboriginal languages,Georgian,1710,1040,25,2150 -65,Non-Official & Non-Aboriginal languages,German,384040,120335,10065,502735 -66,Non-Official & Non-Aboriginal languages,"Germanic languages, n.i.e.",525,1630,725,8705 -67,Aboriginal languages,Gitxsan (Gitksan),880,315,10,1305 -68,Non-Official & Non-Aboriginal 
languages,Greek,106525,44550,1020,150965 -69,Non-Official & Non-Aboriginal languages,Gujarati,108780,64150,885,149045 -70,Aboriginal languages,Gwich'in,255,50,10,360 -71,Aboriginal languages,Haida,80,10,0,465 -72,Aboriginal languages,Haisla,90,20,0,175 -73,Non-Official & Non-Aboriginal languages,Haitian Creole,3030,1280,25,6855 -74,Non-Official & Non-Aboriginal languages,Hakka,10910,4085,70,12445 -75,Aboriginal languages,Halkomelem,480,50,20,1060 -76,Non-Official & Non-Aboriginal languages,Harari,1320,735,0,1715 -77,Non-Official & Non-Aboriginal languages,Hebrew,19530,8560,825,75020 -78,Aboriginal languages,Heiltsuk,100,5,10,125 -79,Non-Official & Non-Aboriginal languages,Hiligaynon,6880,2210,25,7925 -80,Non-Official & Non-Aboriginal languages,Hindi,110645,55510,1405,433365 -81,Non-Official & Non-Aboriginal languages,Hmong-Mien languages,795,335,10,870 -82,Non-Official & Non-Aboriginal languages,Hungarian,61235,19480,440,71285 -83,Non-Official & Non-Aboriginal languages,Icelandic,1285,270,0,1780 -84,Non-Official & Non-Aboriginal languages,Igbo,4235,1000,10,8855 -85,Non-Official & Non-Aboriginal languages,Ilocano,26345,9125,110,34530 -86,Non-Official & Non-Aboriginal languages,"Indo-Iranian languages, n.i.e.",5185,2380,20,8870 -87,Aboriginal languages,Inuinnaqtun (Inuvialuktun),1020,165,30,1975 -88,Aboriginal languages,"Inuit languages, n.i.e.",310,90,15,470 -89,Aboriginal languages,Inuktitut,35210,29230,8795,40620 -90,Aboriginal languages,"Iroquoian languages, n.i.e.",35,5,0,115 -91,Non-Official & Non-Aboriginal languages,Italian,375635,115415,1705,574725 -92,Non-Official & Non-Aboriginal languages,"Italic (Romance) languages, n.i.e.",720,175,25,2680 -93,Non-Official & Non-Aboriginal languages,Japanese,43640,19785,3255,83095 -94,Non-Official & Non-Aboriginal languages,Kabyle,13150,5490,15,17120 -95,Non-Official & Non-Aboriginal languages,Kannada,3970,1630,10,8245 -96,Non-Official & Non-Aboriginal languages,Karenic languages,4705,3860,135,4895 -97,Non-Official & 
Non-Aboriginal languages,Kashmiri,565,135,0,905 -98,Aboriginal languages,Kaska (Nahani),180,20,10,365 -99,Non-Official & Non-Aboriginal languages,Khmer (Cambodian),20130,10885,475,27035 -100,Non-Official & Non-Aboriginal languages,Kinyarwanda (Rwanda),5250,1530,25,7860 -101,Non-Official & Non-Aboriginal languages,Konkani,3330,720,10,6790 -102,Non-Official & Non-Aboriginal languages,Korean,153425,109705,12150,172750 -103,Non-Official & Non-Aboriginal languages,Kurdish,11705,6580,185,15290 -104,Aboriginal languages,Kutenai,110,10,0,170 -105,Aboriginal languages,Kwakiutl (Kwak'wala),325,25,15,605 -106,Non-Official & Non-Aboriginal languages,Lao,12670,6175,150,17235 -107,Non-Official & Non-Aboriginal languages,Latvian,5450,1255,35,6500 -108,Aboriginal languages,Lillooet,315,25,15,790 -109,Non-Official & Non-Aboriginal languages,Lingala,3805,1045,10,17010 -110,Non-Official & Non-Aboriginal languages,Lithuanian,7075,2015,60,8185 -111,Non-Official & Non-Aboriginal languages,Macedonian,16770,6830,95,23075 -112,Non-Official & Non-Aboriginal languages,Malagasy,1430,430,0,2340 -113,Non-Official & Non-Aboriginal languages,Malay,12275,3625,140,22470 -114,Non-Official & Non-Aboriginal languages,Malayalam,28565,15440,95,37810 -115,Aboriginal languages,Malecite,300,55,10,760 -116,Non-Official & Non-Aboriginal languages,Maltese,5565,1125,25,7625 -117,Non-Official & Non-Aboriginal languages,Mandarin,592040,462890,60090,814450 -118,Non-Official & Non-Aboriginal languages,Marathi,8295,3780,30,15565 -119,Aboriginal languages,Mi'kmaq,6690,3565,915,9025 -120,Aboriginal languages,Michif,465,80,10,1210 -121,Non-Official & Non-Aboriginal languages,Min Dong,1230,345,30,1045 -122,Non-Official & Non-Aboriginal languages,"Min Nan (Chaochow, Teochow, Fukien, Taiwanese)",31800,13965,565,42840 -123,Aboriginal languages,Mohawk,985,255,30,2415 -124,Non-Official & Non-Aboriginal languages,Mongolian,1575,905,10,2095 -125,Aboriginal languages,Montagnais (Innu),10235,8585,2055,11445 -126,Aboriginal 
languages,Moose Cree,105,10,0,195 -127,Aboriginal languages,Naskapi,1205,1195,370,1465 -128,Non-Official & Non-Aboriginal languages,Nepali,18275,13375,195,21385 -129,Non-Official & Non-Aboriginal languages,"Niger-Congo languages, n.i.e.",19135,4010,30,40760 -130,Non-Official & Non-Aboriginal languages,"Nilo-Saharan languages, n.i.e.",3750,1520,0,4550 -131,Aboriginal languages,Nisga'a,400,75,10,1055 -132,Aboriginal languages,North Slavey (Hare),765,340,95,1005 -133,Aboriginal languages,Northern East Cree,315,110,35,550 -134,Aboriginal languages,Northern Tutchone,220,30,0,280 -135,Non-Official & Non-Aboriginal languages,Norwegian,4615,350,70,8120 -136,Aboriginal languages,Nuu-chah-nulth (Nootka),280,30,10,560 -137,Aboriginal languages,Oji-Cree,12855,7905,1080,15605 -138,Aboriginal languages,Ojibway,17885,6175,765,28580 -139,Aboriginal languages,Okanagan,275,80,20,820 -140,Aboriginal languages,Oneida,60,15,0,185 -141,Non-Official & Non-Aboriginal languages,Oriya (Odia),1055,475,0,1530 -142,Non-Official & Non-Aboriginal languages,Oromo,4960,3410,45,6245 -143,Non-Official & Non-Aboriginal languages,"Other languages, n.i.e.",3685,1110,80,9730 -144,Aboriginal languages,Ottawa (Odawa),150,75,0,205 -145,Non-Official & Non-Aboriginal languages,"Pampangan (Kapampangan, Pampango)",4045,1200,10,5425 -146,Non-Official & Non-Aboriginal languages,Pangasinan,1390,240,0,1800 -147,Non-Official & Non-Aboriginal languages,Pashto,16905,10590,50,23180 -148,Non-Official & Non-Aboriginal languages,Persian (Farsi),214200,143025,4580,252325 -149,Aboriginal languages,Plains Cree,3065,1345,95,5905 -150,Non-Official & Non-Aboriginal languages,Polish,181710,74780,2495,214965 -151,Non-Official & Non-Aboriginal languages,Portuguese,221535,98710,7485,295955 -152,Non-Official & Non-Aboriginal languages,Punjabi (Panjabi),501680,349140,27865,668240 -153,Non-Official & Non-Aboriginal languages,Quebec Sign Language,695,730,130,4665 -154,Non-Official & Non-Aboriginal 
languages,Romanian,96660,53325,745,115050 -155,Non-Official & Non-Aboriginal languages,Rundi (Kirundi),5850,2110,0,8590 -156,Non-Official & Non-Aboriginal languages,Russian,188255,116595,4855,269645 -157,Aboriginal languages,"Salish languages, n.i.e.",260,25,0,560 -158,Aboriginal languages,Sarsi (Sarcee),80,10,0,145 -159,Non-Official & Non-Aboriginal languages,Scottish Gaelic,1090,190,15,3980 -160,Aboriginal languages,Sekani,85,15,0,185 -161,Non-Official & Non-Aboriginal languages,"Semitic languages, n.i.e.",2150,1205,65,3220 -162,Non-Official & Non-Aboriginal languages,Serbian,57350,31750,530,73780 -163,Non-Official & Non-Aboriginal languages,Serbo-Croatian,9550,3890,30,11275 -164,Non-Official & Non-Aboriginal languages,Shona,3185,1035,0,5430 -165,Aboriginal languages,Shuswap (Secwepemctsin),445,50,35,1305 -166,Non-Official & Non-Aboriginal languages,"Sign languages, n.i.e",4125,6690,645,22280 -167,Non-Official & Non-Aboriginal languages,Sindhi,11860,4975,35,20260 -168,Non-Official & Non-Aboriginal languages,Sinhala (Sinhalese),16335,7790,40,27825 -169,Aboriginal languages,"Siouan languages, n.i.e.",55,20,0,140 -170,Aboriginal languages,"Slavey, n.o.s.",280,105,10,675 -171,Non-Official & Non-Aboriginal languages,"Slavic languages, n.i.e.",2420,670,10,2995 -172,Non-Official & Non-Aboriginal languages,Slovak,17580,5610,100,21470 -173,Non-Official & Non-Aboriginal languages,Slovene (Slovenian),9785,2055,15,11490 -174,Non-Official & Non-Aboriginal languages,Somali,36755,22895,220,49660 -175,Aboriginal languages,South Slavey,945,370,35,1365 -176,Aboriginal languages,Southern East Cree,45,15,0,40 -177,Aboriginal languages,Southern Tutchone,70,5,0,145 -178,Non-Official & Non-Aboriginal languages,Spanish,458850,263505,13030,995260 -179,Aboriginal languages,Squamish,40,5,10,285 -180,Aboriginal languages,Stoney,3025,1950,240,3675 -181,Aboriginal languages,Straits,80,25,15,365 -182,Non-Official & Non-Aboriginal languages,Swahili,13370,5370,80,38685 -183,Aboriginal 
languages,Swampy Cree,1440,330,10,2350 -184,Non-Official & Non-Aboriginal languages,Swedish,6840,1050,125,14140 -185,Non-Official & Non-Aboriginal languages,"Tagalog (Pilipino, Filipino)",431385,213790,3450,612735 -186,Aboriginal languages,Tahltan,95,5,0,265 -187,Non-Official & Non-Aboriginal languages,"Tai-Kadai languages, n.i.e",85,30,0,115 -188,Non-Official & Non-Aboriginal languages,Tamil,140720,96955,2085,189860 -189,Non-Official & Non-Aboriginal languages,Telugu,15660,8280,40,23165 -190,Non-Official & Non-Aboriginal languages,Thai,9255,3365,525,15395 -191,Aboriginal languages,Thompson (Ntlakapamux),335,20,0,450 -192,Non-Official & Non-Aboriginal languages,Tibetan,6160,4590,50,7050 -193,Non-Official & Non-Aboriginal languages,"Tibeto-Burman languages, n.i.e.",1405,655,15,2380 -194,Non-Official & Non-Aboriginal languages,Tigrigna,16645,10205,130,21340 -195,Aboriginal languages,Tlingit,95,0,10,260 -196,Aboriginal languages,Tsimshian,200,30,10,410 -197,Non-Official & Non-Aboriginal languages,"Turkic languages, n.i.e.",1315,455,10,1875 -198,Non-Official & Non-Aboriginal languages,Turkish,32815,18955,690,50770 -199,Non-Official & Non-Aboriginal languages,Ukrainian,102485,28250,1210,132115 -200,Non-Official & Non-Aboriginal languages,"Uralic languages, n.i.e.",10,5,0,25 -201,Non-Official & Non-Aboriginal languages,Urdu,210815,128785,1495,322220 -202,Non-Official & Non-Aboriginal languages,Uyghur,1035,610,20,1390 -203,Non-Official & Non-Aboriginal languages,Uzbek,1720,995,15,2465 -204,Non-Official & Non-Aboriginal languages,Vietnamese,156430,104245,8075,198895 -205,Non-Official & Non-Aboriginal languages,Vlaams (Flemish),3895,355,35,4400 -206,Aboriginal languages,"Wakashan languages, n.i.e.",10,0,0,25 -207,Non-Official & Non-Aboriginal languages,Waray-Waray,1110,310,0,1395 -208,Non-Official & Non-Aboriginal languages,Welsh,1075,95,0,1695 -209,Non-Official & Non-Aboriginal languages,Wolof,3990,1385,10,8240 -210,Aboriginal languages,Woods Cree,1840,800,75,2665 
-211,Non-Official & Non-Aboriginal languages,Wu (Shanghainese),12915,7650,105,16530 -212,Non-Official & Non-Aboriginal languages,Yiddish,13555,7085,895,20985 -213,Non-Official & Non-Aboriginal languages,Yoruba,9080,2615,15,22415 +category,language,mother_tongue,most_at_home,most_at_work,lang_known +Aboriginal languages,"Aboriginal languages, n.o.s.",590,235,30,665 +Non-Official & Non-Aboriginal languages,Afrikaans,10260,4785,85,23415 +Non-Official & Non-Aboriginal languages,"Afro-Asiatic languages, n.i.e.",1150,445,10,2775 +Non-Official & Non-Aboriginal languages,Akan (Twi),13460,5985,25,22150 +Non-Official & Non-Aboriginal languages,Albanian,26895,13135,345,31930 +Aboriginal languages,"Algonquian languages, n.i.e.",45,10,0,120 +Aboriginal languages,Algonquin,1260,370,40,2480 +Non-Official & Non-Aboriginal languages,American Sign Language,2685,3020,1145,21930 +Non-Official & Non-Aboriginal languages,Amharic,22465,12785,200,33670 +Non-Official & Non-Aboriginal languages,Arabic,419890,223535,5585,629055 +Non-Official & Non-Aboriginal languages,Armenian,33460,21510,450,41295 +Non-Official & Non-Aboriginal languages,Assyrian Neo-Aramaic,16070,10510,205,19740 +Aboriginal languages,"Athabaskan languages, n.i.e.",50,10,0,85 +Aboriginal languages,Atikamekw,6150,5465,1100,6645 +Non-Official & Non-Aboriginal languages,"Austro-Asiatic languages, n.i.e",170,80,0,190 +Non-Official & Non-Aboriginal languages,"Austronesian languages, n.i.e.",4195,1160,35,5585 +Non-Official & Non-Aboriginal languages,Azerbaijani,3255,1245,25,5455 +Aboriginal languages,Babine (Wetsuwet'en),110,20,10,210 +Non-Official & Non-Aboriginal languages,Bamanankan,1535,345,0,3190 +Aboriginal languages,Beaver,190,50,0,340 +Non-Official & Non-Aboriginal languages,Belarusan,810,225,0,2265 +Non-Official & Non-Aboriginal languages,Bengali,73125,47350,525,91220 +Non-Official & Non-Aboriginal languages,"Berber languages, n.i.e.",8985,2615,15,12510 +Non-Official & Non-Aboriginal languages,Bikol,1785,290,0,2075 
+Non-Official & Non-Aboriginal languages,Bilen,805,615,15,1085 +Aboriginal languages,Blackfoot,2815,1110,85,5645 +Non-Official & Non-Aboriginal languages,Bosnian,12215,6045,155,18265 +Non-Official & Non-Aboriginal languages,Bulgarian,20020,11985,200,22425 +Non-Official & Non-Aboriginal languages,Burmese,3585,2245,75,4995 +Non-Official & Non-Aboriginal languages,Cantonese,565270,400220,58820,699125 +Aboriginal languages,Carrier,1025,250,15,2100 +Non-Official & Non-Aboriginal languages,Catalan,870,350,30,2035 +Aboriginal languages,Cayuga,45,10,10,125 +Non-Official & Non-Aboriginal languages,Cebuano,19890,7205,70,27040 +Non-Official & Non-Aboriginal languages,"Celtic languages, n.i.e.",525,80,10,3595 +Non-Official & Non-Aboriginal languages,Chaldean Neo-Aramaic,5545,3445,35,7115 +Aboriginal languages,Chilcotin,655,255,15,1150 +Non-Official & Non-Aboriginal languages,"Chinese languages, n.i.e.",615,280,0,590 +Non-Official & Non-Aboriginal languages,"Chinese, n.o.s.",38580,23940,2935,41685 +Aboriginal languages,Comox,85,0,0,185 +Aboriginal languages,"Cree, n.o.s.",64050,37950,7800,86115 +Non-Official & Non-Aboriginal languages,"Creole languages, n.i.e.",4985,2005,15,16635 +Non-Official & Non-Aboriginal languages,"Creole, n.o.s.",64110,24570,310,133045 +Non-Official & Non-Aboriginal languages,Croatian,48200,16775,220,69835 +Non-Official & Non-Aboriginal languages,"Cushitic languages, n.i.e.",365,180,0,480 +Non-Official & Non-Aboriginal languages,Czech,22295,6235,70,28725 +Aboriginal languages,Dakota,1210,255,20,1760 +Non-Official & Non-Aboriginal languages,Danish,12630,855,85,15750 +Aboriginal languages,Dene,10700,7710,770,13060 +Non-Official & Non-Aboriginal languages,Dinka,2120,1130,0,2475 +Aboriginal languages,Dogrib (Tlicho),1650,1020,165,2375 +Non-Official & Non-Aboriginal languages,"Dravidian languages, n.i.e.",490,190,0,790 +Non-Official & Non-Aboriginal languages,Dutch,99015,9565,1165,120870 +Non-Official & Non-Aboriginal languages,Edo,1670,410,0,3220 
+Non-Official & Non-Aboriginal languages,Estonian,5445,975,55,6070 +Non-Official & Non-Aboriginal languages,Ewe,1760,405,10,3000 +Non-Official & Non-Aboriginal languages,Fijian,745,195,0,1665 +Non-Official & Non-Aboriginal languages,Finnish,15295,2790,105,17590 +Non-Official & Non-Aboriginal languages,Frisian,2100,185,40,2910 +Non-Official & Non-Aboriginal languages,"Fulah (Pular, Pulaar, Fulfulde)",2825,825,0,4725 +Non-Official & Non-Aboriginal languages,Ga,920,250,0,2250 +Non-Official & Non-Aboriginal languages,Ganda,1295,345,25,2495 +Non-Official & Non-Aboriginal languages,Georgian,1710,1040,25,2150 +Non-Official & Non-Aboriginal languages,German,384040,120335,10065,502735 +Non-Official & Non-Aboriginal languages,"Germanic languages, n.i.e.",525,1630,725,8705 +Aboriginal languages,Gitxsan (Gitksan),880,315,10,1305 +Non-Official & Non-Aboriginal languages,Greek,106525,44550,1020,150965 +Non-Official & Non-Aboriginal languages,Gujarati,108780,64150,885,149045 +Aboriginal languages,Gwich'in,255,50,10,360 +Aboriginal languages,Haida,80,10,0,465 +Aboriginal languages,Haisla,90,20,0,175 +Non-Official & Non-Aboriginal languages,Haitian Creole,3030,1280,25,6855 +Non-Official & Non-Aboriginal languages,Hakka,10910,4085,70,12445 +Aboriginal languages,Halkomelem,480,50,20,1060 +Non-Official & Non-Aboriginal languages,Harari,1320,735,0,1715 +Non-Official & Non-Aboriginal languages,Hebrew,19530,8560,825,75020 +Aboriginal languages,Heiltsuk,100,5,10,125 +Non-Official & Non-Aboriginal languages,Hiligaynon,6880,2210,25,7925 +Non-Official & Non-Aboriginal languages,Hindi,110645,55510,1405,433365 +Non-Official & Non-Aboriginal languages,Hmong-Mien languages,795,335,10,870 +Non-Official & Non-Aboriginal languages,Hungarian,61235,19480,440,71285 +Non-Official & Non-Aboriginal languages,Icelandic,1285,270,0,1780 +Non-Official & Non-Aboriginal languages,Igbo,4235,1000,10,8855 +Non-Official & Non-Aboriginal languages,Ilocano,26345,9125,110,34530 +Non-Official & Non-Aboriginal 
languages,"Indo-Iranian languages, n.i.e.",5185,2380,20,8870 +Aboriginal languages,Inuinnaqtun (Inuvialuktun),1020,165,30,1975 +Aboriginal languages,"Inuit languages, n.i.e.",310,90,15,470 +Aboriginal languages,Inuktitut,35210,29230,8795,40620 +Aboriginal languages,"Iroquoian languages, n.i.e.",35,5,0,115 +Non-Official & Non-Aboriginal languages,Italian,375635,115415,1705,574725 +Non-Official & Non-Aboriginal languages,"Italic (Romance) languages, n.i.e.",720,175,25,2680 +Non-Official & Non-Aboriginal languages,Japanese,43640,19785,3255,83095 +Non-Official & Non-Aboriginal languages,Kabyle,13150,5490,15,17120 +Non-Official & Non-Aboriginal languages,Kannada,3970,1630,10,8245 +Non-Official & Non-Aboriginal languages,Karenic languages,4705,3860,135,4895 +Non-Official & Non-Aboriginal languages,Kashmiri,565,135,0,905 +Aboriginal languages,Kaska (Nahani),180,20,10,365 +Non-Official & Non-Aboriginal languages,Khmer (Cambodian),20130,10885,475,27035 +Non-Official & Non-Aboriginal languages,Kinyarwanda (Rwanda),5250,1530,25,7860 +Non-Official & Non-Aboriginal languages,Konkani,3330,720,10,6790 +Non-Official & Non-Aboriginal languages,Korean,153425,109705,12150,172750 +Non-Official & Non-Aboriginal languages,Kurdish,11705,6580,185,15290 +Aboriginal languages,Kutenai,110,10,0,170 +Aboriginal languages,Kwakiutl (Kwak'wala),325,25,15,605 +Non-Official & Non-Aboriginal languages,Lao,12670,6175,150,17235 +Non-Official & Non-Aboriginal languages,Latvian,5450,1255,35,6500 +Aboriginal languages,Lillooet,315,25,15,790 +Non-Official & Non-Aboriginal languages,Lingala,3805,1045,10,17010 +Non-Official & Non-Aboriginal languages,Lithuanian,7075,2015,60,8185 +Non-Official & Non-Aboriginal languages,Macedonian,16770,6830,95,23075 +Non-Official & Non-Aboriginal languages,Malagasy,1430,430,0,2340 +Non-Official & Non-Aboriginal languages,Malay,12275,3625,140,22470 +Non-Official & Non-Aboriginal languages,Malayalam,28565,15440,95,37810 +Aboriginal languages,Malecite,300,55,10,760 
+Non-Official & Non-Aboriginal languages,Maltese,5565,1125,25,7625 +Non-Official & Non-Aboriginal languages,Mandarin,592040,462890,60090,814450 +Non-Official & Non-Aboriginal languages,Marathi,8295,3780,30,15565 +Aboriginal languages,Mi'kmaq,6690,3565,915,9025 +Aboriginal languages,Michif,465,80,10,1210 +Non-Official & Non-Aboriginal languages,Min Dong,1230,345,30,1045 +Non-Official & Non-Aboriginal languages,"Min Nan (Chaochow, Teochow, Fukien, Taiwanese)",31800,13965,565,42840 +Aboriginal languages,Mohawk,985,255,30,2415 +Non-Official & Non-Aboriginal languages,Mongolian,1575,905,10,2095 +Aboriginal languages,Montagnais (Innu),10235,8585,2055,11445 +Aboriginal languages,Moose Cree,105,10,0,195 +Aboriginal languages,Naskapi,1205,1195,370,1465 +Non-Official & Non-Aboriginal languages,Nepali,18275,13375,195,21385 +Non-Official & Non-Aboriginal languages,"Niger-Congo languages, n.i.e.",19135,4010,30,40760 +Non-Official & Non-Aboriginal languages,"Nilo-Saharan languages, n.i.e.",3750,1520,0,4550 +Aboriginal languages,Nisga'a,400,75,10,1055 +Aboriginal languages,North Slavey (Hare),765,340,95,1005 +Aboriginal languages,Northern East Cree,315,110,35,550 +Aboriginal languages,Northern Tutchone,220,30,0,280 +Non-Official & Non-Aboriginal languages,Norwegian,4615,350,70,8120 +Aboriginal languages,Nuu-chah-nulth (Nootka),280,30,10,560 +Aboriginal languages,Oji-Cree,12855,7905,1080,15605 +Aboriginal languages,Ojibway,17885,6175,765,28580 +Aboriginal languages,Okanagan,275,80,20,820 +Aboriginal languages,Oneida,60,15,0,185 +Non-Official & Non-Aboriginal languages,Oriya (Odia),1055,475,0,1530 +Non-Official & Non-Aboriginal languages,Oromo,4960,3410,45,6245 +Non-Official & Non-Aboriginal languages,"Other languages, n.i.e.",3685,1110,80,9730 +Aboriginal languages,Ottawa (Odawa),150,75,0,205 +Non-Official & Non-Aboriginal languages,"Pampangan (Kapampangan, Pampango)",4045,1200,10,5425 +Non-Official & Non-Aboriginal languages,Pangasinan,1390,240,0,1800 +Non-Official & 
Non-Aboriginal languages,Pashto,16905,10590,50,23180 +Non-Official & Non-Aboriginal languages,Persian (Farsi),214200,143025,4580,252325 +Aboriginal languages,Plains Cree,3065,1345,95,5905 +Non-Official & Non-Aboriginal languages,Polish,181710,74780,2495,214965 +Non-Official & Non-Aboriginal languages,Portuguese,221535,98710,7485,295955 +Non-Official & Non-Aboriginal languages,Punjabi (Panjabi),501680,349140,27865,668240 +Non-Official & Non-Aboriginal languages,Quebec Sign Language,695,730,130,4665 +Non-Official & Non-Aboriginal languages,Romanian,96660,53325,745,115050 +Non-Official & Non-Aboriginal languages,Rundi (Kirundi),5850,2110,0,8590 +Non-Official & Non-Aboriginal languages,Russian,188255,116595,4855,269645 +Aboriginal languages,"Salish languages, n.i.e.",260,25,0,560 +Aboriginal languages,Sarsi (Sarcee),80,10,0,145 +Non-Official & Non-Aboriginal languages,Scottish Gaelic,1090,190,15,3980 +Aboriginal languages,Sekani,85,15,0,185 +Non-Official & Non-Aboriginal languages,"Semitic languages, n.i.e.",2150,1205,65,3220 +Non-Official & Non-Aboriginal languages,Serbian,57350,31750,530,73780 +Non-Official & Non-Aboriginal languages,Serbo-Croatian,9550,3890,30,11275 +Non-Official & Non-Aboriginal languages,Shona,3185,1035,0,5430 +Aboriginal languages,Shuswap (Secwepemctsin),445,50,35,1305 +Non-Official & Non-Aboriginal languages,"Sign languages, n.i.e",4125,6690,645,22280 +Non-Official & Non-Aboriginal languages,Sindhi,11860,4975,35,20260 +Non-Official & Non-Aboriginal languages,Sinhala (Sinhalese),16335,7790,40,27825 +Aboriginal languages,"Siouan languages, n.i.e.",55,20,0,140 +Aboriginal languages,"Slavey, n.o.s.",280,105,10,675 +Non-Official & Non-Aboriginal languages,"Slavic languages, n.i.e.",2420,670,10,2995 +Non-Official & Non-Aboriginal languages,Slovak,17580,5610,100,21470 +Non-Official & Non-Aboriginal languages,Slovene (Slovenian),9785,2055,15,11490 +Non-Official & Non-Aboriginal languages,Somali,36755,22895,220,49660 +Aboriginal languages,South 
Slavey,945,370,35,1365 +Aboriginal languages,Southern East Cree,45,15,0,40 +Aboriginal languages,Southern Tutchone,70,5,0,145 +Non-Official & Non-Aboriginal languages,Spanish,458850,263505,13030,995260 +Aboriginal languages,Squamish,40,5,10,285 +Aboriginal languages,Stoney,3025,1950,240,3675 +Aboriginal languages,Straits,80,25,15,365 +Non-Official & Non-Aboriginal languages,Swahili,13370,5370,80,38685 +Aboriginal languages,Swampy Cree,1440,330,10,2350 +Non-Official & Non-Aboriginal languages,Swedish,6840,1050,125,14140 +Non-Official & Non-Aboriginal languages,"Tagalog (Pilipino, Filipino)",431385,213790,3450,612735 +Aboriginal languages,Tahltan,95,5,0,265 +Non-Official & Non-Aboriginal languages,"Tai-Kadai languages, n.i.e",85,30,0,115 +Non-Official & Non-Aboriginal languages,Tamil,140720,96955,2085,189860 +Non-Official & Non-Aboriginal languages,Telugu,15660,8280,40,23165 +Non-Official & Non-Aboriginal languages,Thai,9255,3365,525,15395 +Aboriginal languages,Thompson (Ntlakapamux),335,20,0,450 +Non-Official & Non-Aboriginal languages,Tibetan,6160,4590,50,7050 +Non-Official & Non-Aboriginal languages,"Tibeto-Burman languages, n.i.e.",1405,655,15,2380 +Non-Official & Non-Aboriginal languages,Tigrigna,16645,10205,130,21340 +Aboriginal languages,Tlingit,95,0,10,260 +Aboriginal languages,Tsimshian,200,30,10,410 +Non-Official & Non-Aboriginal languages,"Turkic languages, n.i.e.",1315,455,10,1875 +Non-Official & Non-Aboriginal languages,Turkish,32815,18955,690,50770 +Non-Official & Non-Aboriginal languages,Ukrainian,102485,28250,1210,132115 +Non-Official & Non-Aboriginal languages,"Uralic languages, n.i.e.",10,5,0,25 +Non-Official & Non-Aboriginal languages,Urdu,210815,128785,1495,322220 +Non-Official & Non-Aboriginal languages,Uyghur,1035,610,20,1390 +Non-Official & Non-Aboriginal languages,Uzbek,1720,995,15,2465 +Non-Official & Non-Aboriginal languages,Vietnamese,156430,104245,8075,198895 +Non-Official & Non-Aboriginal languages,Vlaams (Flemish),3895,355,35,4400 
+Aboriginal languages,"Wakashan languages, n.i.e.",10,0,0,25 +Non-Official & Non-Aboriginal languages,Waray-Waray,1110,310,0,1395 +Non-Official & Non-Aboriginal languages,Welsh,1075,95,0,1695 +Non-Official & Non-Aboriginal languages,Wolof,3990,1385,10,8240 +Aboriginal languages,Woods Cree,1840,800,75,2665 +Non-Official & Non-Aboriginal languages,Wu (Shanghainese),12915,7650,105,16530 +Non-Official & Non-Aboriginal languages,Yiddish,13555,7085,895,20985 +Non-Official & Non-Aboriginal languages,Yoruba,9080,2615,15,22415 diff --git a/source/reading.md b/source/reading.md index 6e27e527..4febd2cd 100644 --- a/source/reading.md +++ b/source/reading.md @@ -15,16 +15,6 @@ kernelspec: (reading)= # Reading in data locally and from the web -We need to import the `pandas` package in order to read data into Python - -```{code-cell} ipython3 -import pandas as pd -``` - -```{code-cell} ipython3 -import warnings -warnings.filterwarnings('ignore') -``` ## Overview @@ -57,24 +47,22 @@ By the end of the chapter, readers will be able to do the following: - Read data into Python using an absolute path, relative path and a URL. - Compare and contrast the following functions: - `read_csv` - - `read_table` - `read_excel` -- Match the following `pandas` `.read_*` function arguments to their descriptions: +- Match the following `pandas` `read_csv` function arguments to their descriptions: - `filepath_or_buffer` - `sep` - `names` - `skiprows` - -- Choose the appropriate `pandas` `.read_*` function and function arguments to load a given plain text tabular data set into Python. +- Choose the appropriate `read_csv` function arguments to load a given plain text tabular data set into Python. - Use `pandas` package's `read_excel` function and arguments to load a sheet from an excel file into Python. -- Connect to a database using the `SQLAlchemy` library. 
-- List the tables in a database using `SQLAlchemy` library's `table_names` function -- Create a reference to a database table that is queriable using the `SQLAlchemy` library's `select` -and `where` functions -- Use `.to_csv` to save a data frame to a csv file -- (*Optional*) Obtain data using **a**pplication **p**rogramming **i**nterfaces (APIs) and web scraping. - - Read/scrape data from an internet URL using the `BeautifulSoup` package - - Compare downloading tabular data from a plain text file (e.g. *.csv) from the web versus scraping data from a .html file +- Connect to a database using the `ibis` library's `connect` function. +- List the tables in a database using the `ibis` library's `list_tables` function +- Create a reference to a database table using the `ibis` library's `table` function +- Execute queries to bring data from a database into Python using the `ibis` library's `execute` function +- Use `to_csv` to save a data frame to a `.csv` file +% - (*Optional*) Obtain data using **a**pplication **p**rogramming **i**nterfaces (APIs) and web scraping. +% - Read/scrape data from an internet URL using the `BeautifulSoup` package +% - Compare downloading tabular data from a plain text file (e.g. `.csv`) from the web versus scraping data from a `.html` file ## Absolute and relative file paths @@ -88,8 +76,7 @@ This chapter will discuss the different functions we can use to import data into Python, but before we can talk about *how* we read the data into Python with these functions, we first need to talk about *where* the data lives. When you load a data set into Python, you first need to tell Python where those files live. The file -could live on your computer (*local*) -or somewhere on the internet (*remote*). +could live on your computer (*local*) or somewhere on the internet (*remote*). The place where the file lives on your computer is called the "path". You can think of the path as directions to the file. 
There are two kinds of paths: @@ -110,16 +97,13 @@ below. ```{figure} img/filesystem.jpeg --- -height: 400px +height: 500px name: Filesystem --- Example file system ``` - - - **Reading `happiness_report.csv` using a relative path:** +++ @@ -148,11 +132,15 @@ folders between the computer's root `/` and the file) isn't usually the same across different computers. For example, suppose Fatima and Jayden are working on a project together on the `happiness_report.csv` data. Fatima's file is stored at -`/home/Fatima/project/data/happiness_report.csv`, +``` +/home/Fatima/project/data/happiness_report.csv +``` while Jayden's is stored at -`/home/Jayden/project/data/happiness_report.csv`. +``` +/home/Jayden/project/data/happiness_report.csv +``` Even though Fatima and Jayden stored their files in the same place on their computers (in their home folders), the absolute paths are different due to @@ -166,17 +154,17 @@ relative paths will work on both! ``` Your file could be stored locally, as we discussed, or it could also be -somewhere on the internet (remotely). A *Uniform Resource Locator (URL)* (web -address) indicates the location of a resource on the internet and -helps us retrieve that resource. Next, we will discuss how to get either -locally or remotely stored data into Python. +somewhere on the internet (remotely). For this purpose we use a +*Uniform Resource Locator (URL)*, i.e., a web address that looks something +like https://google.com/. URLs indicate the location of a resource on the internet and +help us retrieve that resource. 
## Reading tabular data from a plain text file into Python (readcsv)= -### `read_csv` to read in comma-separated files +### `read_csv` to read in comma-separated values files -```{index} csv, reading; delimiter, read function; read\_csv +```{index} csv, reading; separator, read function; read\_csv ``` Now that we have learned about *where* data could be, we will learn about *how* @@ -184,11 +172,11 @@ to import data into Python using various functions. Specifically, we will learn to *read* tabular data from a plain text file (a document containing only text) *into* Python and *write* tabular data to a file *out of* Python. The function we use to do this depends on the file's format. For example, in the last chapter, we learned about using -the `pandas` `read_csv` function when reading .csv (**c**omma-**s**eparated **v**alues) -files. In that case, the separator or *delimiter* that divided our columns was a +the `read_csv` function from `pandas` when reading `.csv` (**c**omma-**s**eparated **v**alues) +files. In that case, the *separator* that divided our columns was a comma (`,`). We only learned the case where the data matched the expected defaults of the `read_csv` function -(column names are present, and commas are used as the delimiter between columns). +(column names are present, and commas are used as the separator between columns). In this section, we will learn how to read files that do not satisfy the default expectations of `read_csv`. @@ -203,11 +191,9 @@ language data from the 2016 Canadian census. We put `data/` before the file's name when we are loading the data set because this data set is located in a sub-folder, named `data`, relative to where we are running our Python code. +Here is what the text in the file `data/can_lang.csv` looks like. 
-Here is what the file would look like in a plain text editor (a program that removes -all formatting, like bolding or different fonts): - -``` +```code category,language,mother_tongue,most_at_home,most_at_work,lang_known Aboriginal languages,"Aboriginal languages, n.o.s.",590,235,30,665 Non-Official & Non-Aboriginal languages,Afrikaans,10260,4785,85,23415 @@ -227,14 +213,15 @@ And here is a review of how we can use `read_csv` to load it into Python. First load the `pandas` package to gain access to useful functions for reading the data. +```{code-cell} ipython3 +import pandas as pd +``` + Next we use `read_csv` to load the data into Python, and in that call we specify the relative path to the file. ```{code-cell} ipython3 canlang_data = pd.read_csv("data/can_lang.csv") -``` - -```{code-cell} ipython3 canlang_data ``` @@ -242,90 +229,87 @@ canlang_data Oftentimes, information about how data was collected, or other relevant information, is included at the top of the data file. This information is -usually written in sentence and paragraph form, with no delimiter because it is +usually written in sentence and paragraph form, with no separator because it is not organized into columns. An example of this is shown below. This information gives the data scientist useful context and information about the data, however, it is not well formatted or intended to be read into a data frame cell along with the tabular data that follows later in the file. -``` +```code Data source: https://ttimbers.github.io/canlang/ Data originally published in: Statistics Canada Census of Population 2016. Reproduced and distributed on an as-is basis with their permission. 
category,language,mother_tongue,most_at_home,most_at_work,lang_known Aboriginal languages,"Aboriginal languages, n.o.s.",590,235,30,665 Non-Official & Non-Aboriginal languages,Afrikaans,10260,4785,85,23415 -Non-Official & Non-Aboriginal languages,"Afro-Asiatic languages, n.i.e.",1150,44 +Non-Official & Non-Aboriginal languages,"Afro-Asiatic languages, n.i.e.",1150,445,10,2775 Non-Official & Non-Aboriginal languages,Akan (Twi),13460,5985,25,22150 Non-Official & Non-Aboriginal languages,Albanian,26895,13135,345,31930 Aboriginal languages,"Algonquian languages, n.i.e.",45,10,0,120 Aboriginal languages,Algonquin,1260,370,40,2480 -Non-Official & Non-Aboriginal languages,American Sign Language,2685,3020,1145,21 +Non-Official & Non-Aboriginal languages,American Sign Language,2685,3020,1145,21930 Non-Official & Non-Aboriginal languages,Amharic,22465,12785,200,33670 ``` With this extra information being present at the top of the file, using `read_csv` as we did previously does not allow us to correctly load the data -into Python. In the case of this file we end up only reading in one column of the -data set: - -``` - -canlang_data = pd.read_csv("data/can_lang-meta-data.csv") -``` +into Python. In the case of this file, Python just prints a `ParserError` +message, indicating that it wasn't able to read the file. +```python +canlang_data = pd.read_csv("data/can_lang_meta-data.csv") ``` -ParserError: Error tokenizing data. C error: Expected 3 fields in line 3, saw 6 +```code +ParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 6 ``` ```{index} Error ``` -> **Note:** In contrast to the normal and expected messages above, this time Python -> printed out a Parsing error for us indicating that there might be a problem with how -> our data is being read in. 
- ```{index} read function; skiprows argument ``` To successfully read data like this into Python, the `skiprows` argument can be useful to tell Python -how many lines to skip before +how many rows to skip before it should start reading in the data. In the example above, we would set this -value to 2 and pass `header` as None to read and load the data correctly. +value to 3 to read and load the data correctly. ```{code-cell} ipython3 -canlang_data = pd.read_csv("data/can_lang-meta-data.csv", skiprows=2, header=None) +canlang_data = pd.read_csv("data/can_lang_meta-data.csv", skiprows=3) canlang_data ``` -How did we know to skip two lines? We looked at the data! The first two lines +How did we know to skip three rows? We looked at the data! The first three rows of the data had information we didn't need to import: -``` -Source: Statistics Canada, Census of Population, 2016. Reproduced and distributed on an "as is" basis with the permission of Statistics Canada. -Date collected: 2020/07/09 +```code +Data source: https://ttimbers.github.io/canlang/ +Data originally published in: Statistics Canada Census of Population 2016. +Reproduced and distributed on an as-is basis with their permission. ``` -The column names began at line 3, so we skipped the first two lines. +The column names began at row 4, so we skipped the first three rows. -### `read_csv` with `sep` argument to read in tab-separated files +### Using the `sep` argument for different separators -Another common way data is stored is with tabs as the delimiter. Notice the +Another common way data is stored is with tabs as the separator. Notice the data file, `can_lang.tsv`, has tabs in between the columns instead of commas. +```code +category language mother_tongue most_at_home most_at_work lang_known +Aboriginal languages Aboriginal languages, n.o.s. 590 235 30 665 +Non-Official & Non-Aboriginal languages Afrikaans 10260 4785 85 23415 +Non-Official & Non-Aboriginal languages Afro-Asiatic languages, n.i.e. 
1150 445 10 2775 +Non-Official & Non-Aboriginal languages Akan (Twi) 13460 5985 25 22150 +Non-Official & Non-Aboriginal languages Albanian 26895 13135 345 31930 +Aboriginal languages Algonquian languages, n.i.e. 45 10 0 120 +Aboriginal languages Algonquin 1260 370 40 2480 +Non-Official & Non-Aboriginal languages American Sign Language 2685 3020 1145 21930 +Non-Official & Non-Aboriginal languages Amharic 22465 12785 200 33670 ``` -category language mother_tongue most_at_home most_at_work lang_kno -Aboriginal languages Aboriginal languages, n.o.s. 590 235 30 665 -Non-Official & Non-Aboriginal languages Afrikaans 10260 4785 85 23415 -Non-Official & Non-Aboriginal languages Afro-Asiatic languages, n.i.e. 1150 -Non-Official & Non-Aboriginal languages Akan (Twi) 13460 5985 25 22150 -Non-Official & Non-Aboriginal languages Albanian 26895 13135 345 31930 -Aboriginal languages Algonquian languages, n.i.e. 45 10 0 120 -Aboriginal languages Algonquin 1260 370 40 2480 -Non-Official & Non-Aboriginal languages American Sign Language 2685 3020 -Non-Official & Non-Aboriginal languages Amharic 22465 12785 200 33670 +```{index} read function; sep argument ``` ```{index} see: tab-separated values; tsv @@ -334,109 +318,109 @@ Non-Official & Non-Aboriginal languages Amharic 22465 12785 200 33670 ```{index} tsv, read function; read_tsv ``` -To read in this type of data, we can use the `read_csv` with `sep` argument -to read in .tsv (**t**ab **s**eparated **v**alues) files. +To read in `.tsv` (**t**ab **s**eparated **v**alues) files, we can set the `sep` argument +in the `read_csv` function to the *tab character* `\t`. -```{code-cell} ipython3 -canlang_data = pd.read_csv("data/can_lang.tsv", sep="\t", header=None) -canlang_data +```{index} escape character ``` -Let's compare the data frame here to the resulting data frame in Section -{ref}`readcsv` after using `read_csv`. Notice anything? They look the same! The -same number of columns/rows and column names! 
So we needed to use different -tools for the job depending on the file format and our resulting table -(`canlang_data`) in both cases was the same! +> **Note:** `\t` is an example of an *escaped character*, +> which always starts with a backslash (`\`). +> Escaped characters are used to represent non-printing characters +> (like the tab) or characters with special meanings (such as quotation marks). -### `read_table` as a more flexible method to get tabular data into Python -```{index} read function; read\_delim, reading; delimiter +```{code-cell} ipython3 +canlang_data = pd.read_csv("data/can_lang.tsv", sep="\t") +canlang_data ``` -`read_csv` and `read_csv` with argument `sep` are actually just special cases of the more general -`read_table` function. We can use -`read_table` to import both comma and tab-separated files (and more), we just -have to specify the delimiter. The `can_lang.tsv` is a different version of -this same data set with no column names and uses tabs as the delimiter -instead of commas. +Let's compare the data frame here to the resulting data frame in Section +{ref}`readcsv` after using `read_csv`. Notice anything? They look the same; they have +the same number of columns and rows, and have the same column names! +So even though we needed to use different +arguments depending on the file format, our resulting data frame +(`canlang_data`) in both cases was the same. -Here is how the file would look in a plain text editor: +### Using the `header` argument to handle missing column names -``` -Aboriginal languages Aboriginal languages, n.o.s. 590 235 30 665 -Non-Official & Non-Aboriginal languages Afrikaans 10260 4785 85 23415 -Non-Official & Non-Aboriginal languages Afro-Asiatic languages, n.i.e. 1150 -Non-Official & Non-Aboriginal languages Akan (Twi) 13460 5985 25 22150 -Non-Official & Non-Aboriginal languages Albanian 26895 13135 345 31930 -Aboriginal languages Algonquian languages, n.i.e. 
45 10 0 120 -Aboriginal languages Algonquin 1260 370 40 2480 -Non-Official & Non-Aboriginal languages American Sign Language 2685 3020 -Non-Official & Non-Aboriginal languages Amharic 22465 12785 200 33670 -Non-Official & Non-Aboriginal languages Arabic 419890 223535 5585 629055 +```{index} read function; header, reading; separator ``` -```{index} read function; sep argument -``` +The `can_lang_no_cols.tsv` file contains a slightly different version +of this data set, except with no column names, and tabs for separators. +Here is how the file looks in a text editor: -To get this into Python using the `read_table` function, we specify the first -argument as the path to the file (as done with `read_csv`), and then provide -values to the `sep` argument (here a -tab, which we represent by `"\t"`). +```code +Aboriginal languages Aboriginal languages, n.o.s. 590 235 30 665 +Non-Official & Non-Aboriginal languages Afrikaans 10260 4785 85 23415 +Non-Official & Non-Aboriginal languages Afro-Asiatic languages, n.i.e. 1150 445 10 2775 +Non-Official & Non-Aboriginal languages Akan (Twi) 13460 5985 25 22150 +Non-Official & Non-Aboriginal languages Albanian 26895 13135 345 31930 +Aboriginal languages Algonquian languages, n.i.e. 45 10 0 120 +Aboriginal languages Algonquin 1260 370 40 2480 +Non-Official & Non-Aboriginal languages American Sign Language 2685 3020 1145 21930 +Non-Official & Non-Aboriginal languages Amharic 22465 12785 200 33670 -```{index} escape character ``` -> **Note:** `\t` is an example of an *escaped character*, -> which always starts with a backslash (`\`). -> Escaped characters are used to represent non-printing characters -> (like the tab) or characters with special meanings (such as quotation marks). +Data frames in Python need to have column names. Thus if you read in data that +don't have column names, Python will assign names automatically. In this example, +Python assigns each column a name of `0, 1, 2, 3, 4, 5`. 
+To read this data into Python, we specify the first +argument as the path to the file (as done with `read_csv`), and then provide +values to the `sep` argument (here a tab, which we represent by `"\t"`), +and finally set `header = None` to tell `pandas` that the data file does not +contain its own column names. ```{code-cell} ipython3 -canlang_data = pd.read_csv("data/can_lang.tsv", - sep = "\t", - header = None) +canlang_data = pd.read_csv( + "data/can_lang_no_cols.tsv", + sep = "\t", + header = None +) canlang_data ``` -Data frames in Python need to have column names. Thus if you read in data that -don't have column names, Python will assign names automatically. In the example -above, Python assigns each column a name of `0, 1, 2, 3, 4, 5`. - ```{index} pandas.DataFrame; rename, pandas ``` -It is best to rename your columns to help differentiate between them -(e.g., `0, 1`, etc., are not very descriptive names and will make it more confusing as -you code). To rename your columns, you can use the `rename` function +It is best to rename your columns manually in this scenario. The current column names +(`0, 1`, etc.) are problematic for two reasons: first, because they are not very descriptive names, which will make your analysis +confusing; and second, because your column names should generally be *strings*, but are currently *integers*. +To rename your columns, you can use the `rename` function from the [pandas package](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html#). -The argument of the `rename` function is `columns`, which is a dictionary, -where the keys are the old column names and values are the new column names. -We rename the old `0, 1, ..., 5` -columns in the `canlang_data` data frame to more descriptive names below, with the -`inplace` argument as `True`, so that the columns are renamed in place. +The argument of the `rename` function is `columns`, which takes a mapping between the old column names and the new column names. 
+In this case, we want to rename the old columns (`0, 1, ..., 5`) in the `canlang_data` data frame to more descriptive names. + +To specify the mapping, we create a *dictionary*: a Python object that represents +a mapping from *keys* to *values*. We can create a dictionary by using a pair of curly +braces `{ }`, and inside the braces placing pairs of `key : value` separated by commas. +Below, we create a dictionary called `col_map` that maps the old column names in `canlang_data` to new column +names, and then pass it to the `rename` function. ```{code-cell} ipython3 -canlang_data.rename(columns = {0:'category', - 1:'language', - 2:'mother_tongue', - 3:'most_at_home', - 4:'most_at_work', - 5:'lang_known'}, inplace = True) - - -canlang_data +col_map = { + 0 : "category", + 1 : "language", + 2 : "mother_tongue", + 3 : "most_at_home", + 4 : "most_at_work", + 5 : "lang_known" +} +canlang_data_renamed = canlang_data.rename(columns = col_map) +canlang_data_renamed ``` ```{index} read function; names argument ``` -The column names can also be assigned to the dataframe while reading it from the file by passing a -list of column names to the `names` argument. `read_csv` and `read_table` have a `names` argument, -whose default value is `[]`. +The column names can also be assigned to the data frame immediately upon reading it from the file by passing a +list of column names to the `names` argument in `read_csv`. ```{code-cell} ipython3 canlang_data = pd.read_csv( - "data/can_lang.tsv", + "data/can_lang_no_cols.tsv", sep="\t", header=None, names=[ @@ -456,10 +440,9 @@ canlang_data ```{index} URL; reading from ``` -We can also use `read_csv`, `read_table`(and related functions) -to read in data directly from a **U**niform **R**esource **L**ocator (URL) that -contains tabular data. 
Here, we provide the URL to -`read_*` as the path to the file instead of a path to a local file on our +We can also use `read_csv` to read in data directly from a **U**niform **R**esource **L**ocator (URL) that +contains tabular data. Here, we provide the URL to a remote file +as the path in `read_csv`, instead of a path to a local file on our computer. We need to surround the URL with quotes similar to when we specify a path on our local computer. All other arguments that we use are the same as when using these functions with a local file on our computer. @@ -474,12 +457,11 @@ canlang_data ### Previewing a data file before reading it into Python -In all the examples above, we gave you previews of the data file before we read +In many of the examples above, we gave you previews of the data file before we read it into Python. Previewing data is essential to see whether or not there are column -names, what the delimiters are, and if there are lines you need to skip. You -should do this yourself when trying to read in data files. You can preview -files in a plain text editor by right-clicking on the file, selecting "Open -With," and choosing a plain text editor (e.g., Notepad). +names, what the separators are, and if there are rows you need to skip. You +should do this yourself when trying to read in data files: open the file in whichever +text editor you prefer to inspect its contents prior to reading it into Python. ## Reading tabular data from a Microsoft Excel file @@ -505,7 +487,7 @@ files. Take a look at a snippet of what a `.xlsx` file would look like in a text +++ -``` +```code ,?'O _rels/.rels???J1??>E?{7? ?4'?|??hlIo??F @@ -537,11 +519,10 @@ canlang_data ``` If the `.xlsx` file has multiple sheets, you have to use the `sheet_name` argument -to specify the sheet number or name. You can also specify cell ranges using the -`usecols` argument(Example: `usecols="A:D"` for including cells from `A` to `D`). 
-This functionality is useful when a single sheet contains +to specify the sheet number or name. This functionality is useful when a single sheet contains multiple tables (a sad thing that happens to many Excel spreadsheets since this -makes reading in data more difficult). +makes reading in data more difficult). You can also specify cell ranges using the +`usecols` argument (e.g., `usecols="A:D"` for including columns from `A` to `D`). As with plain text files, you should always explore the data file before importing it into Python. Exploring the data beforehand helps you decide which @@ -549,35 +530,35 @@ arguments you need to load the data into Python successfully. If you do not have the Excel program on your computer, you can use other programs to preview the file. Examples include Google Sheets and Libre Office. -In {numref}`read_func` we summarize the `read_*` functions we covered -in this chapter. We also include the `read_csv2` function for data separated by +In {numref}`read_func` we summarize the `read_csv` and `read_excel` functions we covered +in this chapter. We also include the arguments for data separated by semicolons `;`, which you may run into with data sets where the decimal is represented by a comma instead of a period (as with some data sets from European countries). 
-```{list-table} Summary of read_* functions +```{list-table} Summary of read_csv and read_excel :header-rows: 1 :name: read_func * - Data File Type - Python Function - - Python Package + - Arguments * - Comma (`,`) separated files - `read_csv` - - `pandas` + - just the file path * - Tab (`\t`) separated files - - `read_csv` with `sep` argument - - `pandas` -* - Semicolon (`;`) separated files - - `read_csv` with `sep` argument - - `pandas` -* - Various formats (`.csv`, `.tsv`) - - `read_table` - - `pandas` + - `read_csv` + - `sep="\t"` +* - Missing header + - `read_csv` + - `header=None` +* - European-style numbers, semicolon (`;`) separators + - `read_csv` + - `sep=";"`, `thousands="."`, `decimal=","` * - Excel files (`.xlsx`) - `read_excel` - - `pandas` + - `sheet_name`, `usecols` ``` @@ -607,195 +588,186 @@ using Python with SQLite and PostgreSQL databases. SQLite is probably the simplest relational database system that one can use in combination with Python. SQLite databases are self-contained and usually stored and accessed locally on one computer. Data is usually stored in -a file with a `.db` extension. Similar to Excel files, these are not plain text -files and cannot be read in a plain text editor. +a file with a `.db` extension (or sometimes a `.sqlite` extension). +Similar to Excel files, these are not plain text files and cannot be read in a plain text editor. -```{index} database; connect, SQLAlchemy, SQLAlchemy; create_engine, database; SQLAlchemy +```{index} database; connect, ibis, ibis; ibis ``` -```{index} see: SQLAlchemy; database +```{index} see: ibis; database ``` The first thing you need to do to read data into Python from a database is to -connect to the database. We do that using the `create_engine` function from the -`sal` (SQLAlchemy) package. This does not read +connect to the database. For an SQLite database, we will do that using +the `connect` function from the +`sqlite` backend in the +`ibis` package. 
This command does not read in the data, but simply tells Python where the database is and opens up a communication channel that Python can use to send SQL commands to the database. +> **Note:** There is another database package in Python called `sqlalchemy`. +> That package is a bit more mature than `ibis`, +> so if you want to dig deeper into working with databases in Python, that is a good next +> package to learn about. We will work with `ibis` in this book, as it +> provides a more modern and friendlier syntax that is more like `pandas` for data analysis code. + ```{code-cell} ipython3 -import sqlalchemy as sal -from sqlalchemy import MetaData, Table, create_engine, select +import ibis -db = sal.create_engine("sqlite:///data/can_lang.db") -conn = db.connect() +conn = ibis.sqlite.connect("data/can_lang.db") ``` -```{index} database; tables +```{index} database; tables; list_tables ``` Often relational databases have many tables; thus, in order to retrieve data from a database, you need to know the name of the table in which the data is stored. You can get the names of -all the tables in the database using the `table_names` +all the tables in the database using the `list_tables` function: ```{code-cell} ipython3 -tables = db.table_names() +tables = conn.list_tables() tables ``` -```{index} database; select, SQLAlchemy; select +```{index} database; table, ibis; table ``` -The `table_names` function returned only one name, which tells us +The `list_tables` function returned only one name---`"can_lang"`---which tells us that there is only one table in this database. To reference a table in the database (so that we can perform operations like selecting columns and filtering rows), we -use the `select` function from the `sqlalchemy` package. 
The object returned -by the `select` function allows us to work with data -stored in databases as if they were just regular data frames; but secretly, behind -the scenes, `sqlalchemy` is turning your function calls (e.g., `select`) -into SQL queries! To access the table in the database, we first declare the `metadata` of the table using -`sqlalchemy` package and then access the table using `select` function from `sqlalchemy` package. +use the `table` function from the `conn` object. The object returned +by the `table` function allows us to work with data +stored in databases as if they were just regular `pandas` data frames; but secretly, behind +the scenes, `ibis` will turn your commands into SQL queries! ```{code-cell} ipython3 -metadata = MetaData(bind=None) -table = Table( - 'can_lang', - metadata, - autoload=True, - autoload_with=db -) +canlang_table = conn.table("can_lang") +canlang_table +``` + +```{index} database; count, ibis; count ``` +Although it looks like we might have obtained the whole data frame from the database, we didn't! +It's a *reference*; the data is still stored only in the SQLite database. The `canlang_table` object +is an `AlchemyTable` (`ibis` is using `sqlalchemy` under the hood!), which, when printed, tells +you which columns are available in the table. But unlike a usual `pandas` data frame, +we do not immediately know how many rows are in the table. In order to find out how many +rows there are, we have to send an SQL *query* (i.e., command) to the database. +In `ibis`, we can do that using the `count` function from the table object. + ```{code-cell} ipython3 -query = select([table]) -canlang_data_db = conn.execute(query) -canlang_data_db +canlang_table.count() ``` -```{index} database; fetchall, SQLAlchemy; fetchall +```{index} execute, ibis; execute ``` -Although it looks like we just got a data frame from the database, we didn't! It's a *reference*; the data is still stored only in the SQLite database. 
The output -is a `CursorResult`(indicating that Python does not know how many rows -there are in total!) object. -In order to actually retrieve this data in Python, -we use the `fetchall()` function. The -`sqlalchemy` package works this way because databases are often more efficient at selecting, filtering -and joining large data sets than Python. And typically the database will not even +Wait a second...this isn't the number of rows in the database. In fact, we haven't actually sent our +SQL query to the database yet! We need to explicitly tell `ibis` when we want to send the query. +The reason for this is that databases are often more efficient at working with (i.e., selecting, filtering, +joining, etc.) large data sets than Python. And typically, the database will not even be stored on your computer, but rather a more powerful machine somewhere on the -web. So Python is lazy and waits to bring this data into memory until you explicitly -tell it to using the `fetchall` function. The `fetchall` function returns the -result of the query in the form of a list, where each row in the table is an element in the list. -Let's look at the first 10 rows in the table. +web. So `ibis` is lazy and waits to bring this data into memory until you explicitly +tell it to using the `execute` function. The `execute` function actually sends the SQL query +to the database, and gives you the result. Let's look at the number of rows in the table by executing +the `count` command. ```{code-cell} ipython3 -canlang_data_db = conn.execute(query).fetchall() -canlang_data_db[:10] +canlang_table.count().execute() ``` +There we go! There are 214 rows in the `can_lang` table. If you are interested in seeing +the *actual* text of the SQL query that `ibis` sends to the database, you can use the `compile` function +instead of `execute`. But note that you have to pass the result of `compile` to the `str` function to turn it into +a human-readable string first. 
-```{index} database; show query, SQLAlchemy; query.compile +```{index} compile, ibis; compile ``` -We can look at the SQL commands that are sent to the database when we write -`conn.execute(query).fetchall()` in Python with the `query.compile` function from the -`sqlalchemy` package. - ```{code-cell} ipython3 -compiled = query.compile(db, compile_kwargs={"render_postcompile": True}) - -print(str(compiled) % compiled.params) +str(canlang_table.count().compile()) ``` The output above shows the SQL code that is sent to the database. When we -write `conn.execute(query).fetchall()` in Python, in the background, the function is +write `canlang_table.count().execute()` in Python, in the background, the `execute` function is translating the Python code into SQL, sending that SQL to the database, and then translating the -response for us. So `sqlalchemy` does all the hard work of translating from Python to SQL and back for us; +response for us. So `ibis` does all the hard work of translating from Python to SQL and back for us; we can just stick with Python! -With our `canlang_data_db` table reference for the 2016 Canadian Census data in hand, we -can mostly continue onward as if it were a regular data frame. For example, -we can use the `select` function along with `where` function -to obtain only certain rows. Below we filter the data to include only Aboriginal languages using -the `where` function of `sqlalchemy` +The `ibis` package provides lots of `pandas`-like tools for working with database tables. +For example, we can look at the first few rows of the table by using the `head` function---and +we won't forget to `execute` to see the result! 
-```{index} database; filter data, SQLAlchemy; where +```{index} database; head, ibis; ``` ```{code-cell} ipython3 -query = select([table]).where(table.columns.category == 'Aboriginal languages') -result_proxy = conn.execute(query) -result_proxy +canlang_table.head(10).execute() ``` -```{index} database; fetchall, SQLAlchemy; fetchall -``` +You can see that `ibis` actually returned a `pandas` data frame to us after we executed the query, +which is very convenient for working with the data after getting it from the database. +So now that we have the `canlang_table` table reference for the 2016 Canadian Census data in hand, we +can mostly continue onward as if it were a regular data frame. For example, let's do the same exercise +from Chapter 1: we will obtain only those rows corresponding to Aboriginal languages, and keep only +the `language` and `mother_tongue` columns. +We can use the `[]` operation with a logical statement +to obtain only certain rows. Below we filter the data to include only Aboriginal languages. -Above you can again see that this data is not actually stored in Python yet: -the output is a `CursorResult`(indicating that Python does not know how many rows -there are in total!) object. -In order to actually retrieve this data in Python as a data frame, -we again use the `fetchall()` function. -Below you will see that after running `fetchall()`, Python knows that the retrieved -data has 67 rows, and there is no `CursorResult` object listed any more. We will display only the first 10 -rows of the table from the list returned by the query. 
+```{index} database; filter, ibis; +``` ```{code-cell} ipython3 -aboriginal_lang_data_db = result_proxy.fetchall() -aboriginal_lang_data_db[:10] +canlang_table_filtered = canlang_table[canlang_table["category"] == "Aboriginal languages"] +canlang_table_filtered ``` +Above you can see that we have not yet executed this command; `canlang_table_filtered` is just showing +the first part of our query (the part that starts with `Selection[r0]` above). +We didn't call `execute` because we are not ready to bring the data into Python yet. +We can still use the database to do some work to obtain *only* the small amount of data we want to work with locally +in Python. Let's add the second part of our SQL query: selecting only the `language` and `mother_tongue` columns. -`sqlalchemy` provides many more functions (not just `select`, `where`) -that you can use to directly feed the database reference (`aboriginal_lang_data_db`) into -downstream analysis functions (e.g., `altair` for data visualization). -But `sqlalchemy` does not provide *every* function that we need for analysis; -we do eventually need to call `fetchall`. - -```{index} pandas.DataFrame; shape +```{index} database; select, ibis; ``` -Does the result returned by `fetchall` function store it as a dataframe? Let's look -what happens when we try to use `shape` to count rows in a dataframe - +```{code-cell} ipython3 +canlang_table_selected = canlang_table_filtered[["language", "mother_tongue"]] +canlang_table_selected ``` -aboriginal_lang_data_db.shape +Now you can see that the `ibis` query will have two steps: it will first find rows corresponding to +Aboriginal languages, then it will extract only the `language` and `mother_tongue` columns that we are interested in. +Let's actually execute the query now to bring the data into Python as a `pandas` data frame, and print the result. 
+```{code-cell} ipython3 +aboriginal_lang_data = canlang_table_selected.execute() +aboriginal_lang_data ``` -``` -## AttributeError: 'list' object has no attribute 'shape' -``` +`ibis` provides many more functions (not just the `[]` operation) +that you can use to manipulate the data within the database before calling +`execute` to obtain the data in Python. But `ibis` does not provide *every* function +that we need for analysis; we do eventually need to call `execute`. +For example, `ibis` does not provide the `tail` function to look at the last +rows in a database, even though `pandas` does. ```{index} pandas.DataFrame; tail ``` -or `tail` to preview the last six rows of a data frame: - -``` -aboriginal_lang_data_db.tail(6) +```{code-cell} ipython3 +canlang_table_selected.tail(6) ``` +```{code-cell} ipython3 +aboriginal_lang_data.tail(6) ``` -## AttributeError: 'list' object has no attribute 'tail' -``` - -Oops! We cannot treat the result as a dataframe, hence we need to convert it -to a dataframe after calling `fetchall` function -```{code-cell} ipython3 -aboriginal_lang_data_db = pd.DataFrame(aboriginal_lang_data_db, columns=['category', 'language', 'mother_tongue', 'most_at_home', 'most_at_work', 'lang_known']) -aboriginal_lang_data_db.shape -``` - -> -> Additionally, some operations will not work to extract columns or single values -> from the reference. Thus, once you have finished -> your data wrangling of the database reference object, it is advisable to -> bring it into Python using `fetchall` and then converting it into the dataframe using `pandas` package. -> But be very careful using `fetchall`: databases are often *very* big, -> and reading an entire table into Python might take a long time to run or even possibly -> crash your machine. So make sure you use `where` and `select` on the database table -> to reduce the data to a reasonable size before using `fetchall` to read it into Python! 
+So once you have finished your data wrangling of the database reference object, it is advisable to +bring it into Python as a `pandas` data frame using the `execute` function. +But be very careful using `execute`: databases are often *very* big, +and reading an entire table into Python might take a long time to run or even possibly +crash your machine. So make sure you select and filter the database table +to reduce the data to a reasonable size before using `execute` to read it into Python! ### Reading data from a PostgreSQL database @@ -808,126 +780,98 @@ Unlike SQLite, PostgreSQL uses a client–server database engine, as it was designed to be used and accessed on a network. This means that you have to provide more information to Python when connecting to Postgres databases. The additional information that you -need to include when you call the `create_engine` function is listed below: +need to include when you call the `connect` function is listed below: -- `dbname`: the name of the database (a single PostgreSQL instance can host more than one database) -- `host`: the URL pointing to where the database is located +- `database`: the name of the database (a single PostgreSQL instance can host more than one database) +- `host`: the URL pointing to where the database is located (`localhost` if it is on your local machine) - `port`: the communication endpoint between Python and the PostgreSQL database (usually `5432`) - `user`: the username for accessing the database - `password`: the password for accessing the database -Additionally, we must use the `pgdb` package instead of `sqlalchemy` in the -`create_engine` function call. Below we demonstrate how to connect to a version of +Below we demonstrate how to connect to a version of the `can_mov_db` database, which contains information about Canadian movies. 
Note that the `host` (`fakeserver.stat.ubc.ca`), `user` (`user0001`), and `password` (`abc123`) below are *not real*; you will not actually be able to connect to a database using this information. -```{code-cell} ipython3 -pip install pgdb -``` - -``` -!pip install pgdb -import pgdb -import sqlalchemy -from sqlalchemy import create_engine - -# connection_str = "postgresql://:@:/" -connection_str = "postgresql://user0001:abc123@fakeserver.stat.ubc.ca:5432/can_mov_db" -db = create_engine(connection_str) -conn_mov_data = db.connect() - +```python +conn = ibis.postgres.connect( + database = "can_mov_db", + host = "fakeserver.stat.ubc.ca", + port = 5432, + user = "user0001", + password = "abc123" +) ``` -After opening the connection, everything looks and behaves almost identically -to when we were using an SQLite database in Python. For example, we can again use -`table_names` to find out what tables are in the `can_mov_db` database: +Aside from needing to provide that additional information, `ibis` makes it so +that connecting to and working with a Postgres database is identical to +connecting to and working with an SQLite database. For example, we can again use +`list_tables` to find out what tables are in the `can_mov_db` database: -``` -tables = conn_mov_data.table_names() -tables +```python +conn.list_tables() ``` - -``` +```text ['themes', 'medium', 'titles', 'title_aliases', 'forms', 'episodes', 'names', 'names_occupations', 'occupation', 'ratings'] - ``` We see that there are 10 tables in this database. Let's first look at the `"ratings"` table to find the lowest rating that exists in the `can_mov_db` -database. To access the table's contents we first need to declare the `metadata` of the table -and store it in a variable named `ratings`. Then, we can use the `select` function to -refer to the data in the table and return the result in python using `fetchall` function, just like -we did for the SQLite database. 
- -``` -metadata = MetaData(bind=None) -ratings = Table( - 'ratings', - metadata, - autoload=True, - autoload_with=db -) - -query = select([ratings]) -ratings_proxy = conn_mov_data.execute(query).fetchall() - -``` - +database. +```python +ratings_table = conn.table("ratings") +ratings_table ``` -[('The Grand Seduction', 6.6, 150), -('Rhymes for Young Ghouls', 6.3, 1685), -('Mommy', 7.5, 1060), -('Incendies', 6.1, 1101), -('Bon Cop, Bad Cop', 7.0, 894), -('Goon', 5.5, 1111), -('Monsieur Lazhar', 5.6,610), -('What if', 5.3, 1401), -('The Barbarian Invations', 5.8, 99 -('Away from Her', 6.9, 2311)] +```text +AlchemyTable: ratings + title string + average_rating float64 + num_votes int64 ``` -```{index} SQLAlchemy; select +```{index} ibis; select ``` To find the lowest rating that exists in the data base, we first need to -extract the `average_rating` column using `select`: +select the `average_rating` column: -``` -avg_rating_db = select([ratings.columns.average_rating]) -avg_rating_db +```python +avg_rating = ratings_table[["average_rating"]] +avg_rating ``` -``` +```text +r0 := AlchemyTable: ratings + title string + average_rating float64 + num_votes int64 -[(6.6,), - (6.3,), - (7.5,), - (6.1,), - (7.0,), - (5.5,), - (5.6,), - (5.4,), - (5.8,), - (6.9,)] +Selection[r0] + selections: + average_rating: r0.average_rating ``` -```{index} min +```{index} database; order_by, ibis; head, ibis; ibis ``` -Next we use `min` to find the minimum rating in that column: +Next we use the `order_by` function from `ibis` to order the table by `average_rating`, +and then the `head` function to select the first row (i.e., the lowest score). -``` -min(avg_rating_db) +```python +lowest = avg_rating.order_by("average_rating").head(1) +lowest.execute() ``` +```{code-cell} ipython3 +:tags: ["remove-input"] +lowest = pd.DataFrame({"average_rating" : [1.0]}) +lowest ``` -(1.0,) -``` + We see the lowest rating given to a movie is 1, indicating that it must have been a really bad movie... 
@@ -937,14 +881,12 @@ been a really bad movie... ```{index} database; reasons to use ``` -Opening a database stored in a `.db` file -involved a lot more effort than just opening a `.csv`, or any of the -other plain text or Excel formats. It was a bit of a pain to use a database in -that setting since we had to use `sqlalchemy` to translate `pandas`-like -commands (`where`, `select`, etc.) into SQL commands that the database -understands. Not all `pandas` commands can currently be translated with -SQLite databases. For example, we can compute a mean with an SQLite database -but can't easily compute a median. So you might be wondering: why should we use +Opening a database involved a lot more effort than just opening a `.csv`, or any of the +other plain text or Excel formats. We had to open a connection to the database, +then use `ibis` to translate `pandas`-like +commands (the `[]` operation, `head`, etc.) into SQL queries that the database +understands, and then finally `execute` them. And not all `pandas` commands can currently be translated +via `ibis` into database queries. So you might be wondering: why should we use databases at all? Databases are beneficial in a large-scale setting: @@ -964,637 +906,638 @@ Databases are beneficial in a large-scale setting: ``` At the middle and end of a data analysis, we often want to write a data frame -that has changed (either through filtering, selecting, mutating or summarizing) +that has changed (through selecting columns, filtering rows, etc.) to a file to share it with others or use it for another step in the analysis. The most straightforward way to do this is to use the `to_csv` function from the `pandas` package. The default -arguments for this file are to use a comma (`,`) as the delimiter and include -column names. 
Below we demonstrate creating a new version of the Canadian -languages data set without the official languages category according to the +arguments are to use a comma (`,`) as the separator, and to include column names +in the first row. We also specify `index = False` to tell `pandas` not to print +row numbers in the `.csv` file. Below we demonstrate creating a new version of the Canadian +languages data set without the "Official languages" category according to the Canadian 2016 Census, and then writing this to a `.csv` file: ```{code-cell} ipython3 -no_official_lang_data = canlang_data[canlang_data['category'] != 'Official languages'] -no_official_lang_data.to_csv("data/no_official_languages.csv") -``` - -## Obtaining data from the web - -> **Note:** This section is not required reading for the remainder of the textbook. It -> is included for those readers interested in learning a little bit more about -> how to obtain different types of data from the web. - -```{index} see: application programming interface; API -``` - -```{index} API -``` - -Data doesn't just magically appear on your computer; you need to get it from -somewhere. Earlier in the chapter we showed you how to access data stored in a -plain text, spreadsheet-like format (e.g., comma- or tab-separated) from a web -URL using one of the `read_*` functions from the `pandas`. But as time goes -on, it is increasingly uncommon to find data (especially large amounts of data) -in this format available for download from a URL. Instead, websites now often -offer something known as an **a**pplication **p**rogramming **i**nterface -(API), which -provides a programmatic way to ask for subsets of a data set. This allows the -website owner to control *who* has access to the data, *what portion* of the -data they have access to, and *how much* data they can access. 
Typically, the -website owner will give you a *token* (a secret string of characters somewhat -like a password) that you have to provide when accessing the API. - -```{index} web scraping, CSS, HTML -``` - -```{index} see: hypertext markup language; HTML -``` - -```{index} see: cascading style sheet; CSS -``` - -Another interesting thought: websites themselves *are* data! When you type a -URL into your browser window, your browser asks the *web server* (another -computer on the internet whose job it is to respond to requests for the -website) to give it the website's data, and then your browser translates that -data into something you can see. If the website shows you some information that -you're interested in, you could *create* a data set for yourself by copying and -pasting that information into a file. This process of taking information -directly from what a website displays is called -*web scraping* (or sometimes *screen scraping*). Now, of course, copying and pasting -information manually is a painstaking and error-prone process, especially when -there is a lot of information to gather. So instead of asking your browser to -translate the information that the web server provides into something you can -see, you can collect that data programmatically—in the form of -**h**yper**t**ext **m**arkup **l**anguage -(HTML) -and **c**ascading **s**tyle **s**heet (CSS) code—and process it -to extract useful information. HTML provides the -basic structure of a site and tells the webpage how to display the content -(e.g., titles, paragraphs, bullet lists etc.), whereas CSS helps style the -content and tells the webpage how the HTML elements should -be presented (e.g., colors, layouts, fonts etc.). 
- -This subsection will show you the basics of both web scraping -with the [`BeautifulSoup` Python package](https://beautiful-soup-4.readthedocs.io/en/latest/) {cite:p}`beautifulsoup` -and accessing the Twitter API -using the [`tweepy` Python package](https://github.com/tweepy/tweepy) {cite:p}`tweepy`. - -+++ - -### Web scraping - -#### HTML and CSS selectors - -```{index} web scraping, HTML; selector, CSS; selector, Craiglist -``` - -When you enter a URL into your browser, your browser connects to the -web server at that URL and asks for the *source code* for the website. -This is the data that the browser translates -into something you can see; so if we -are going to create our own data by scraping a website, we have to first understand -what that data looks like! For example, let's say we are interested -in knowing the average rental price (per square foot) of the most recently -available one-bedroom apartments in Vancouver -on [Craiglist](https://vancouver.craigslist.org). When we visit the Vancouver Craigslist -website and search for one-bedroom apartments, -we should see something similar to {numref}`fig:craigslist-human`. - -+++ - -```{figure} img/craigslist_human.png -:name: fig:craigslist-human - -Craigslist webpage of advertisements for one-bedroom apartments. -``` - -+++ - -Based on what our browser shows us, it's pretty easy to find the size and price -for each apartment listed. But we would like to be able to obtain that information -using Python, without any manual human effort or copying and pasting. We do this by -examining the *source code* that the web server actually sent our browser to -display for us. We show a snippet of it below; the -entire source -is [included with the code for this book](https://github.com/UBC-DSCI/introduction-to-datascience-python/blob/main/source/img/website_source.txt): - -``` - - $800 - - - 1br - - - - (13768 108th Avenue) - - - map - - - - hide this posting - - - - restore - restore this posting - - - -

- -
  • - - $2285 - -``` - -Oof...you can tell that the source code for a web page is not really designed -for humans to understand easily. However, if you look through it closely, you -will find that the information we're interested in is hidden among the muck. -For example, near the top of the snippet -above you can see a line that looks like - -```html -$800 -``` - -That is definitely storing the price of a particular apartment. With some more -investigation, you should be able to find things like the date and time of the -listing, the address of the listing, and more. So this source code most likely -contains all the information we are interested in! - -```{index} HTML; tag -``` - -Let's dig into that line above a bit more. You can see that -that bit of code has an *opening tag* (words between `<` and `>`, like -``) and a *closing tag* (the same with a slash, like ``). HTML -source code generally stores its data between opening and closing tags like -these. Tags are keywords that tell the web browser how to display or format -the content. Above you can see that the information we want (`$800`) is stored -between an opening and closing tag (`` and ``). In the opening -tag, you can also see a very useful "class" (a special word that is sometimes -included with opening tags): `class="result-price"`. Since we want R to -programmatically sort through all of the source code for the website to find -apartment prices, maybe we can look for all the tags with the `"result-price"` -class, and grab the information between the opening and closing tag. Indeed, -take a look at another line of the source snippet above: - -```html -$2285 -``` - -It's yet another price for an apartment listing, and the tags surrounding it -have the `"result-price"` class. Wonderful! 
Now that we know what pattern we -are looking for—a dollar amount between opening and closing tags that have the -`"result-price"` class—we should be able to use code to pull out all of the -matching patterns from the source code to obtain our data. This sort of "pattern" -is known as a *CSS selector* (where CSS stands for **c**ascading **s**tyle **s**heet). - -The above was a simple example of "finding the pattern to look for"; many -websites are quite a bit larger and more complex, and so is their website -source code. Fortunately, there are tools available to make this process -easier. For example, -[SelectorGadget](https://selectorgadget.com/) is -an open-source tool that simplifies identifying the generating -and finding of CSS selectors. -At the end of the chapter in the additional resources section, we include a link to -a short video on how to install and use the SelectorGadget tool to -obtain CSS selectors for use in web scraping. -After installing and enabling the tool, you can click the -website element for which you want an appropriate selector. For -example, if we click the price of an apartment listing, we -find that SelectorGadget shows us the selector `.result-price` -in its toolbar, and highlights all the other apartment -prices that would be obtained using that selector ({numref}`fig:sg1`). - -```{figure} img/sg1.png -:name: fig:sg1 - -Using the SelectorGadget on a Craigslist webpage to obtain the CCS selector useful for obtaining apartment prices. -``` - -If we then click the size of an apartment listing, SelectorGadget shows us -the `span` selector, and highlights many of the lines on the page; this indicates that the -`span` selector is not specific enough to capture only apartment sizes ({numref}`fig:sg3`). - -```{figure} img/sg3.png -:name: fig:sg3 - -Using the SelectorGadget on a Craigslist webpage to obtain a CCS selector useful for obtaining apartment sizes. 
-``` - -To narrow the selector, we can click one of the highlighted elements that -we *do not* want. For example, we can deselect the "pic/map" links, -resulting in only the data we want highlighted using the `.housing` selector ({numref}`fig:sg2`). - -```{figure} img/sg2.png -:name: fig:sg2 - -Using the SelectorGadget on a Craigslist webpage to refine the CCS selector to one that is most useful for obtaining apartment sizes. -``` - -So to scrape information about the square footage and rental price -of apartment listings, we need to use -the two CSS selectors `.housing` and `.result-price`, respectively. -The selector gadget returns them to us as a comma-separated list (here -`.housing , .result-price`), which is exactly the format we need to provide to -Python if we are using more than one CSS selector. - -**Stop! Are you allowed to scrape that website?** - -```{index} web scraping; permission -``` - -+++ - -*Before* scraping data from the web, you should always check whether or not -you are *allowed* to scrape it! There are two documents that are important -for this: the `robots.txt` file and the Terms of Service -document. If we take a look at [Craigslist's Terms of Service document](https://www.craigslist.org/about/terms.of.use), -we find the following text: *"You agree not to copy/collect CL content -via robots, spiders, scripts, scrapers, crawlers, or any automated or manual equivalent (e.g., by hand)."* -So unfortunately, without explicit permission, we are not allowed to scrape the website. - -```{index} Wikipedia -``` - -What to do now? Well, we *could* ask the owner of Craigslist for permission to scrape. -However, we are not likely to get a response, and even if we did they would not likely give us permission. -The more realistic answer is that we simply cannot scrape Craigslist. If we still want -to find data about rental prices in Vancouver, we must go elsewhere. 
-To continue learning how to scrape data from the web, let's instead -scrape data on the population of Canadian cities from Wikipedia. -We have checked the [Terms of Service document](https://foundation.wikimedia.org/wiki/Terms_of_Use/en), -and it does not mention that web scraping is disallowed. -We will use the SelectorGadget tool to pick elements that we are interested in -(city names and population counts) and deselect others to indicate that we are not -interested in them (province names), as shown in {numref}`fig:sg4`. - -```{figure} img/selectorgadget-wiki-updated.png -:name: fig:sg4 - -Using the SelectorGadget on a Wikipedia webpage. -``` - -We include a link to a short video tutorial on this process at the end of the chapter -in the additional resources section. SelectorGadget provides in its toolbar -the following list of CSS selectors to use: - -+++ - -``` -td:nth-child(8) , -td:nth-child(6) , -td:nth-child(4) , -.mw-parser-output div tr+ tr td:nth-child(2) -``` - -+++ - -Now that we have the CSS selectors that describe the properties of the elements -that we want to target (e.g., has a tag name `price`), we can use them to find -certain elements in web pages and extract data. - -+++ - -**Using `pandas.read_html`** - -+++ - -The easiest way to read a table from HTML is to use [`pandas.read_html`](https://pandas.pydata.org/docs/reference/api/pandas.read_html.html). We can see that the Wikipedia page of "Canada" has 18 tables. - -```{code-cell} ipython3 -:tags: [remove-output] - -canada_wiki = pd.read_html("https://en.wikipedia.org/wiki/Canada") -len(canada_wiki) -``` - -``` -18 -``` - -+++ - -With some inspection, we find that the table that shows the population of the most populated provinces is of index 1. 
- -```{code-cell} ipython3 -:tags: [remove-output] - -df = canada_wiki[1] -df.columns = df.columns.droplevel() -df -``` - -```{code-cell} ipython3 -:tags: [remove-input] - -df = pd.read_csv("data/canada-wiki-read_html.csv", index_col=0) -df -``` - -**Using `BeautifulSoup`** - -```{index} BeautifulSoup, requests -``` - -Now that we have our CSS selectors we can use the `requests` and `BeautifulSoup` Python packages to scrape our desired data from the website. We start by loading the packages: - -```{code-cell} ipython3 -import requests -from bs4 import BeautifulSoup -``` - -Next, we tell Python what page we want to scrape by providing the webpage's URL in quotations to the function `requests.get` and pass it into the `BeautifulSoup` function for parsing: - -```{code-cell} ipython3 -wiki = requests.get("https://en.wikipedia.org/wiki/Canada") -page = BeautifulSoup(wiki.content, "html.parser") -``` - -The `requests.get` function sends a `GET` request to the specified URL and returns the server's response to the HTTP request (*i.e.* a `requests.Response` object). The `BeautifulSoup` function takes the content of the response and returns the HTML source code itself, which we have -stored in the `page` variable. Next, we use the `select` method of the page object along with the CSS selectors we obtained from the SelectorGadget tool. Make sure to surround the selectors with quotation marks; `select` expects that -argument is a string. It selects *nodes* from the HTML document that -match the CSS selectors you specified. A *node* is an HTML tag pair (e.g., -`` and `` which defines the cell of a table) combined with the content -stored between the tags. For our CSS selector `td:nth-child(6)`, an example -node that would be selected would be: - -+++ - -``` - -London - -``` - -+++ - -We store the result of the `select` function in the `population_nodes` variable. Note that it returns a list, and we slice the list to only print the first 5 elements. 
- -```{code-cell} ipython3 -:tags: [remove-output] - -population_nodes = page.select( - "td:nth-child(8) , td:nth-child(6) , td:nth-child(4) , .mw-parser-output div td:nth-child(2)" -) -population_nodes[:5] -``` - -``` -[Toronto, - 6,202,225, - London, - 543,551 - , - Montreal] -``` - -+++ - -Next we extract the meaningful data—in other words, we get rid of the HTML code syntax and tags—from -the nodes using the `get_text` -function. In the case of the example -node above, `get_text` function returns `"London"`. - -```{code-cell} ipython3 -:tags: [remove-output] - -[row.get_text() for row in population_nodes][:5] -``` - -``` -['Toronto', '6,202,225', 'London', '543,551\n', 'Montreal'] -``` - -+++ - -Fantastic! We seem to have extracted the data of interest from the -raw HTML source code. But we are not quite done; the data -is not yet in an optimal format for data analysis. Both the city names and -population are encoded as characters in a single vector, instead of being in a -data frame with one character column for city and one numeric column for -population (like a spreadsheet). -Additionally, the populations contain commas (not useful for programmatically -dealing with numbers), and some even contain a line break character at the end -(`\n`). In Chapter {ref}`wrangling`, we will learn more about how to *wrangle* data -such as this into a more useful format for data analysis using Python. - -+++ - -### Using an API - -```{index} API -``` - -Rather than posting a data file at a URL for you to download, many websites these days -provide an API that must be accessed through a programming language like Python. The benefit of this -is that data owners have much more control over the data they provide to users. However, unlike -web scraping, there is no consistent way to access an API across websites. Every website typically -has its own API designed especially for its own use case. 
Therefore we will just provide one example -of accessing data through an API in this book, with the hope that it gives you enough of a basic -idea that you can learn how to use another API if needed. - -```{index} API; tweepy, tweepy, Twitter, API; token -``` - -+++ - -In particular, in this book we will show you the basics of how to use -the `tweepy` package in Python to access -data from the Twitter API. `tweepy` requires the [Twitter Developer Portal](https://developer.twitter.com/en/portal/dashboard) and you will need to get tokens and secrets from that, through which your access to the data will then be authenticated and controlled. - -+++ - -First, we go to the [Twitter Developer Portal](https://developer.twitter.com/en/portal/dashboard) and sign up an account if you do not have one yet. Note that you will need a valid phone number to associate with your developer account. After filling out the basic information, we will get the *essential access* to the Twitter API. Then we can create an app and hit the "get key" button, and we will get the API key and API key secret of the app (along with the bearer token which will not be used in this demonstration). **We need to store the key and secret at a safe place, and make sure do not show them to anyone else (also do not accidentally push it to the GitHub repository).** If you lose the key, you can always regenerate it. Next, we go to the "Keys and tokens" tab of the app, and generate an access token and an access token secret. **Save the access token and the access token secret at a safe place as well.** Your app will look something like {numref}`fig:twitter-API-keys-tokens`. - -+++ - -```{figure} img/twitter-API-keys-tokens.png -:name: fig:twitter-API-keys-tokens - -Generating the API key-secret pair and the access token-secret pair in Twitter API. -``` - -+++ - -Once you get the access keys and secrets, you can follow along with the examples that we show here. 
-To get started, load the `tweepy` package and authenticate our access to the Twitter developer portal account. - -```{code-cell} ipython3 -:tags: [remove-output] - -import tweepy - -# replace these with the api key, api key secret, access token and access token secret -# generated on your own -api_key = "8OxHWiIWjy8M39LvnC8OfSXrj" -api_key_secret = "scqjRqX5stoy4pYB5Zu52tCBKzhGLDh5nRqTEM6CMoLRkRLR8F" - -access_token = "1556029189484007425-mYwaDCI1WnCxjuMt0jb2UYD2ns8BYB" -access_token_secret = "pDG4Ta7giYLY3mablPhd6y9bB5y2Aer1Cn18rihIJFBB7" - -# Authenticate to Twitter -auth = tweepy.OAuthHandler(api_key, api_key_secret) -auth.set_access_token(access_token, access_token_secret) - -api = tweepy.API(auth) - -try: - api.verify_credentials() - print("Successful Authentication") -except: - print("Failed authentication") -``` - -``` -Successful Authentication -``` - -+++ - -`tweepy` provides an extensive set of functions to search -Twitter for tweets, users, their followers, and more. -Let's construct a small data set of the last 200 tweets and -retweets from the [@scikit_learn](https://twitter.com/scikit_learn) account. A few of the most recent tweets -are shown in {numref}`fig:01-scikit-learn-twitter`. - -+++ - -```{figure} img/scikit-learn-twitter.png -:name: fig:01-scikit-learn-twitter - -The `scikit-learn` account Twitter feed. -``` - -+++ - -**Stop! Think about your API usage carefully!** - -When you access an API, you are initiating a transfer of data from a web server -to your computer. Web servers are expensive to run and do not have infinite resources. -If you try to ask for *too much data* at once, you can use up a huge amount of the server's bandwidth. -If you try to ask for data *too frequently*—e.g., if you -make many requests to the server in quick succession—you can also bog the server down and make -it unable to talk to anyone else. 
Most servers have mechanisms to revoke your access if you are not -careful, but you should try to prevent issues from happening in the first place by being extra careful -with how you write and run your code. You should also keep in mind that when a website owner -grants you API access, they also usually specify a limit (or *quota*) of how much data you can ask for. -Be careful not to overrun your quota! In this example, we should take a look at - [the Twitter website](https://developer.twitter.com/en/docs/twitter-api/rate-limits) to see what limits -we should abide by when using the API. - -+++ - -**Using `tweepy`** - -After checking the Twitter website, it seems like asking for 200 tweets one time is acceptable. -So we can use the `user_timeline` function to ask for the last 200 tweets from the [@scikit_learn](https://twitter.com/scikit_learn) account. - -```{code-cell} ipython3 -:tags: [remove-output] - -userID = "scikit_learn" - -scikit_learn_tweets = api.user_timeline( - screen_name=userID, - count=200, - include_rts=True, - tweet_mode="extended", -) -``` - -Let's take a look at the first 3 most recent tweets of [@scikit_learn](https://twitter.com/scikit_learn) through accessing the attributes of tweet data dictionary: - -```{code-cell} ipython3 -:tags: [remove-output] - -for info in scikit_learn_tweets[:3]: - print("ID: {}".format(info.id)) - print(info.created_at) - print(info.full_text) - print("\n") -``` - -``` -ID: 1555686128971403265 -2022-08-05 22:44:11+00:00 -scikit-learn 1.1.2 is out on https://t.co/lSpi4eDc2t and conda-forge! 
- -This is a small maintenance release that fixes a couple of regressions: -https://t.co/Oa84ES0qpG - - -ID: 1549321048943988737 -2022-07-19 09:11:37+00:00 -RT @MarenWestermann: @scikit_learn It is worth highlighting that this scikit-learn sprint is seeing the highest participation of women out… - - -ID: 1548339716465930244 -2022-07-16 16:12:09+00:00 -@StefanieMolin @theBodlina @RichardKlima We continue pulling requests here in Dublin. Putting some Made in Ireland code in the scikit-learn codebase 🇮🇪 . Current stats: 18 PRs opened, 12 merged 🚀 https://t.co/ccWy8vh8YI -``` - -+++ - -A full list of available attributes provided by Twitter API can be found [here](https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet). - -+++ - -For the demonstration purpose, let's only use a -few variables of interest: `created_at`, `user.screen_name`, `retweeted`, -and `full_text`, and construct a `pandas` DataFrame using the extracted information. - -```{code-cell} ipython3 -:tags: [remove-output] - -columns = ["time", "user", "is_retweet", "text"] -data = [] -for tweet in scikit_learn_tweets: - data.append( - [tweet.created_at, tweet.user.screen_name, tweet.retweeted, tweet.full_text] - ) - -scikit_learn_tweets_df = pd.DataFrame(data, columns=columns) -scikit_learn_tweets_df -``` - -```{code-cell} ipython3 -:tags: [remove-input] - -scikit_learn_tweets_df = pd.read_csv("data/reading_api_df.csv", index_col=0) -scikit_learn_tweets_df -``` - -If you look back up at the image of the [@scikit_learn](https://twitter.com/scikit_learn) Twitter page, you will -recognize the text of the most recent few tweets in the above data frame. In -other words, we have successfully created a small data set using the Twitter -API—neat! This data is also quite different from what we obtained from web scraping; -the extracted information can be easily converted into a `pandas` data frame (although not *every* API will provide data in such a nice format). 
-From this point onward, the `scikit_learn_tweets_df` data frame is stored on your -machine, and you can play with it to your heart's content. For example, you can use -`pandas.to_csv` to save it to a file and `pandas.read_csv` to read it into Python again later; -and after reading the next few chapters you will have the skills to -compute the percentage of retweets versus tweets, find the most oft-retweeted -account, make visualizations of the data, and much more! If you decide that you want -to ask the Twitter API for more data -(see [the `tweepy` page](https://github.com/tweepy/tweepy) -for more examples of what is possible), just be mindful as usual about how much -data you are requesting and how frequently you are making requests. - -+++ +no_official_lang_data = canlang_data[canlang_data["category"] != "Official languages"] +no_official_lang_data.to_csv("data/no_official_languages.csv", index=False) +``` + +% ## Obtaining data from the web +% +% > **Note:** This section is not required reading for the remainder of the textbook. It +% > is included for those readers interested in learning a little bit more about +% > how to obtain different types of data from the web. +% +% ```{index} see: application programming interface; API +% ``` +% +% ```{index} API +% ``` +% +% Data doesn't just magically appear on your computer; you need to get it from +% somewhere. Earlier in the chapter we showed you how to access data stored in a +% plain text, spreadsheet-like format (e.g., comma- or tab-separated) from a web +% URL using one of the `read_*` functions from the `pandas` package. But as time goes +% on, it is increasingly uncommon to find data (especially large amounts of data) +% in this format available for download from a URL. Instead, websites now often +% offer something known as an **a**pplication **p**rogramming **i**nterface +% (API), which +% provides a programmatic way to ask for subsets of a data set.
This allows the +% website owner to control *who* has access to the data, *what portion* of the +% data they have access to, and *how much* data they can access. Typically, the +% website owner will give you a *token* (a secret string of characters somewhat +% like a password) that you have to provide when accessing the API. +% +% ```{index} web scraping, CSS, HTML +% ``` +% +% ```{index} see: hypertext markup language; HTML +% ``` +% +% ```{index} see: cascading style sheet; CSS +% ``` +% +% Another interesting thought: websites themselves *are* data! When you type a +% URL into your browser window, your browser asks the *web server* (another +% computer on the internet whose job it is to respond to requests for the +% website) to give it the website's data, and then your browser translates that +% data into something you can see. If the website shows you some information that +% you're interested in, you could *create* a data set for yourself by copying and +% pasting that information into a file. This process of taking information +% directly from what a website displays is called +% *web scraping* (or sometimes *screen scraping*). Now, of course, copying and pasting +% information manually is a painstaking and error-prone process, especially when +% there is a lot of information to gather. So instead of asking your browser to +% translate the information that the web server provides into something you can +% see, you can collect that data programmatically—in the form of +% **h**yper**t**ext **m**arkup **l**anguage +% (HTML) +% and **c**ascading **s**tyle **s**heet (CSS) code—and process it +% to extract useful information. HTML provides the +% basic structure of a site and tells the webpage how to display the content +% (e.g., titles, paragraphs, bullet lists etc.), whereas CSS helps style the +% content and tells the webpage how the HTML elements should +% be presented (e.g., colors, layouts, fonts etc.). 
+% +% This subsection will show you the basics of both web scraping +% with the [`BeautifulSoup` Python package](https://beautiful-soup-4.readthedocs.io/en/latest/) {cite:p}`beautifulsoup` +% and accessing the Twitter API +% using the [`tweepy` Python package](https://github.com/tweepy/tweepy) {cite:p}`tweepy`. +% +% +++ +% +% ### Web scraping +% +% #### HTML and CSS selectors +% +% ```{index} web scraping, HTML; selector, CSS; selector, Craigslist +% ``` +% +% When you enter a URL into your browser, your browser connects to the +% web server at that URL and asks for the *source code* for the website. +% This is the data that the browser translates +% into something you can see; so if we +% are going to create our own data by scraping a website, we have to first understand +% what that data looks like! For example, let's say we are interested +% in knowing the average rental price (per square foot) of the most recently +% available one-bedroom apartments in Vancouver +% on [Craigslist](https://vancouver.craigslist.org). When we visit the Vancouver Craigslist +% website and search for one-bedroom apartments, +% we should see something similar to {numref}`fig:craigslist-human`. +% +% +++ +% +% ```{figure} img/craigslist_human.png +% :name: fig:craigslist-human +% +% Craigslist webpage of advertisements for one-bedroom apartments. +% ``` +% +% +++ +% +% Based on what our browser shows us, it's pretty easy to find the size and price +% for each apartment listed. But we would like to be able to obtain that information +% using Python, without any manual human effort or copying and pasting. We do this by +% examining the *source code* that the web server actually sent our browser to +% display for us.
We show a snippet of it below; the +% entire source +% is [included with the code for this book](https://github.com/UBC-DSCI/introduction-to-datascience-python/blob/main/source/img/website_source.txt): +% +% ```html +% +% $800 +% +% +% 1br - +% +% +% (13768 108th Avenue) +% +% +% map +% +% +% +% hide this posting +% +% +% +% restore +% restore this posting +% +% +% +%

    +%
  • +%
  • +% +% $2285 +% +% ``` +% +% Oof...you can tell that the source code for a web page is not really designed +% for humans to understand easily. However, if you look through it closely, you +% will find that the information we're interested in is hidden among the muck. +% For example, near the top of the snippet +% above you can see a line that looks like +% +% ```html +% <span class="result-price">$800</span> +% ``` +% +% That is definitely storing the price of a particular apartment. With some more +% investigation, you should be able to find things like the date and time of the +% listing, the address of the listing, and more. So this source code most likely +% contains all the information we are interested in! +% +% ```{index} HTML; tag +% ``` +% +% Let's dig into that line above a bit more. You can see that +% that bit of code has an *opening tag* (words between `<` and `>`, like +% `<span>`) and a *closing tag* (the same with a slash, like `</span>`). HTML +% source code generally stores its data between opening and closing tags like +% these. Tags are keywords that tell the web browser how to display or format +% the content. Above you can see that the information we want (`$800`) is stored +% between an opening and closing tag (`<span>` and `</span>`). In the opening +% tag, you can also see a very useful "class" (a special word that is sometimes +% included with opening tags): `class="result-price"`. Since we want Python to +% programmatically sort through all of the source code for the website to find +% apartment prices, maybe we can look for all the tags with the `"result-price"` +% class, and grab the information between the opening and closing tag. Indeed, +% take a look at another line of the source snippet above: +% +% ```html +% <span class="result-price">$2285</span> +% ``` +% +% It's yet another price for an apartment listing, and the tags surrounding it +% have the `"result-price"` class. Wonderful!
Now that we know what pattern we +% are looking for—a dollar amount between opening and closing tags that have the +% `"result-price"` class—we should be able to use code to pull out all of the +% matching patterns from the source code to obtain our data. This sort of "pattern" +% is known as a *CSS selector* (where CSS stands for **c**ascading **s**tyle **s**heet). +% +% The above was a simple example of "finding the pattern to look for"; many +% websites are quite a bit larger and more complex, and so is their website +% source code. Fortunately, there are tools available to make this process +% easier. For example, +% [SelectorGadget](https://selectorgadget.com/) is +% an open-source tool that simplifies generating +% and finding CSS selectors. +% At the end of the chapter in the additional resources section, we include a link to +% a short video on how to install and use the SelectorGadget tool to +% obtain CSS selectors for use in web scraping. +% After installing and enabling the tool, you can click the +% website element for which you want an appropriate selector. For +% example, if we click the price of an apartment listing, we +% find that SelectorGadget shows us the selector `.result-price` +% in its toolbar, and highlights all the other apartment +% prices that would be obtained using that selector ({numref}`fig:sg1`). +% +% ```{figure} img/sg1.png +% :name: fig:sg1 +% +% Using the SelectorGadget on a Craigslist webpage to obtain the CSS selector useful for obtaining apartment prices. +% ``` +% +% If we then click the size of an apartment listing, SelectorGadget shows us +% the `span` selector, and highlights many of the lines on the page; this indicates that the +% `span` selector is not specific enough to capture only apartment sizes ({numref}`fig:sg3`). +% +% ```{figure} img/sg3.png +% :name: fig:sg3 +% +% Using the SelectorGadget on a Craigslist webpage to obtain a CSS selector useful for obtaining apartment sizes.
+% ``` +% +% To narrow the selector, we can click one of the highlighted elements that +% we *do not* want. For example, we can deselect the "pic/map" links, +% resulting in only the data we want highlighted using the `.housing` selector ({numref}`fig:sg2`). +% +% ```{figure} img/sg2.png +% :name: fig:sg2 +% +% Using the SelectorGadget on a Craigslist webpage to refine the CSS selector to one that is most useful for obtaining apartment sizes. +% ``` +% +% So to scrape information about the square footage and rental price +% of apartment listings, we need to use +% the two CSS selectors `.housing` and `.result-price`, respectively. +% The selector gadget returns them to us as a comma-separated list (here +% `.housing , .result-price`), which is exactly the format we need to provide to +% Python if we are using more than one CSS selector. +% +% **Stop! Are you allowed to scrape that website?** +% +% ```{index} web scraping; permission +% ``` +% +% +++ +% +% *Before* scraping data from the web, you should always check whether or not +% you are *allowed* to scrape it! There are two documents that are important +% for this: the `robots.txt` file and the Terms of Service +% document. If we take a look at [Craigslist's Terms of Service document](https://www.craigslist.org/about/terms.of.use), +% we find the following text: *"You agree not to copy/collect CL content +% via robots, spiders, scripts, scrapers, crawlers, or any automated or manual equivalent (e.g., by hand)."* +% So unfortunately, without explicit permission, we are not allowed to scrape the website. +% +% ```{index} Wikipedia +% ``` +% +% What to do now? Well, we *could* ask the owner of Craigslist for permission to scrape. +% However, we are not likely to get a response, and even if we did they would not likely give us permission. +% The more realistic answer is that we simply cannot scrape Craigslist. If we still want +% to find data about rental prices in Vancouver, we must go elsewhere.
+% To continue learning how to scrape data from the web, let's instead +% scrape data on the population of Canadian cities from Wikipedia. +% We have checked the [Terms of Service document](https://foundation.wikimedia.org/wiki/Terms_of_Use/en), +% and it does not mention that web scraping is disallowed. +% We will use the SelectorGadget tool to pick elements that we are interested in +% (city names and population counts) and deselect others to indicate that we are not +% interested in them (province names), as shown in {numref}`fig:sg4`. +% +% ```{figure} img/selectorgadget-wiki-updated.png +% :name: fig:sg4 +% +% Using the SelectorGadget on a Wikipedia webpage. +% ``` +% +% We include a link to a short video tutorial on this process at the end of the chapter +% in the additional resources section. SelectorGadget provides in its toolbar +% the following list of CSS selectors to use: +% +% +++ +% +% ```code +% td:nth-child(8) , +% td:nth-child(6) , +% td:nth-child(4) , +% .mw-parser-output div tr+ tr td:nth-child(2) +% ``` +% +% +++ +% +% Now that we have the CSS selectors that describe the properties of the elements +% that we want to target (e.g., has a tag name `price`), we can use them to find +% certain elements in web pages and extract data. +% +% +++ +% +% **Using `pandas.read_html`** +% +% +++ +% +% The easiest way to read a table from HTML is to use [`pandas.read_html`](https://pandas.pydata.org/docs/reference/api/pandas.read_html.html). We can see that the Wikipedia page of "Canada" has 18 tables. +% +% ```{code-cell} ipython3 +% :tags: [remove-output] +% +% canada_wiki = pd.read_html("https://en.wikipedia.org/wiki/Canada") +% len(canada_wiki) +% ``` +% +% ``` +% 18 +% ``` +% +% +++ +% +% With some inspection, we find that the table that shows the population of the most populated provinces is of index 1. 
+% +% ```{code-cell} ipython3 +% :tags: [remove-output] +% +% df = canada_wiki[1] +% df.columns = df.columns.droplevel() +% df +% ``` +% +% ```{code-cell} ipython3 +% :tags: [remove-input] +% +% df = pd.read_csv("data/canada-wiki-read_html.csv", index_col=0) +% df +% ``` +% +% **Using `BeautifulSoup`** +% +% ```{index} BeautifulSoup, requests +% ``` +% +% Now that we have our CSS selectors we can use the `requests` and `BeautifulSoup` Python packages to scrape our desired data from the website. We start by loading the packages: +% +% ```{code-cell} ipython3 +% import requests +% from bs4 import BeautifulSoup +% ``` +% +% Next, we tell Python what page we want to scrape by providing the webpage's URL in quotations to the function `requests.get` and pass it into the `BeautifulSoup` function for parsing: +% +% ```{code-cell} ipython3 +% wiki = requests.get("https://en.wikipedia.org/wiki/Canada") +% page = BeautifulSoup(wiki.content, "html.parser") +% ``` +% +% The `requests.get` function sends a `GET` request to the specified URL and returns the server's response to the HTTP request (*i.e.* a `requests.Response` object). The `BeautifulSoup` function takes the content of the response and returns the HTML source code itself, which we have +% stored in the `page` variable. Next, we use the `select` method of the page object along with the CSS selectors we obtained from the SelectorGadget tool. Make sure to surround the selectors with quotation marks; `select` expects this +% argument to be a string. It selects *nodes* from the HTML document that +% match the CSS selectors you specified. A *node* is an HTML tag pair (e.g., +% `<td>` and `</td>`, which define the cell of a table) combined with the content +% stored between the tags. For our CSS selector `td:nth-child(6)`, an example +% node that would be selected would be: +% +% +++ +% +% ``` +% +% London +% +% ``` +% +% +++ +% +% We store the result of the `select` method in the `population_nodes` variable.
Note that it returns a list, and we slice the list to only print the first 5 elements. +% +% ```{code-cell} ipython3 +% :tags: [remove-output] +% +% population_nodes = page.select( +% "td:nth-child(8) , td:nth-child(6) , td:nth-child(4) , .mw-parser-output div td:nth-child(2)" +% ) +% population_nodes[:5] +% ``` +% +% ``` +% [Toronto, +% 6,202,225, +% London, +% 543,551 +% , +% Montreal] +% ``` +% +% +++ +% +% Next we extract the meaningful data—in other words, we get rid of the HTML code syntax and tags—from +% the nodes using the `get_text` +% method. In the case of the example +% node above, the `get_text` method returns `"London"`. +% +% ```{code-cell} ipython3 +% :tags: [remove-output] +% +% [row.get_text() for row in population_nodes][:5] +% ``` +% +% ``` +% ['Toronto', '6,202,225', 'London', '543,551\n', 'Montreal'] +% ``` +% +% +++ +% +% Fantastic! We seem to have extracted the data of interest from the +% raw HTML source code. But we are not quite done; the data +% is not yet in an optimal format for data analysis. Both the city names and +% populations are stored as strings in a single list, instead of being in a +% data frame with one string column for city and one numeric column for +% population (like a spreadsheet). +% Additionally, the populations contain commas (not useful for programmatically +% dealing with numbers), and some even contain a line break character at the end +% (`\n`). In Chapter {ref}`wrangling`, we will learn more about how to *wrangle* data +% such as this into a more useful format for data analysis using Python. +% +% +++ +% +% ### Using an API +% +% ```{index} API +% ``` +% +% Rather than posting a data file at a URL for you to download, many websites these days +% provide an API that must be accessed through a programming language like Python. The benefit of this +% is that data owners have much more control over the data they provide to users.
However, unlike +% web scraping, there is no consistent way to access an API across websites. Every website typically +% has its own API designed especially for its own use case. Therefore we will just provide one example +% of accessing data through an API in this book, with the hope that it gives you enough of a basic +% idea that you can learn how to use another API if needed. +% +% ```{index} API; tweepy, tweepy, Twitter, API; token +% ``` +% +% +++ +% +% In particular, in this book we will show you the basics of how to use +% the `tweepy` package in Python to access +% data from the Twitter API. Using `tweepy` requires an account on the [Twitter Developer Portal](https://developer.twitter.com/en/portal/dashboard), from which you will need to get tokens and secrets; these are how your access to the data will be authenticated and controlled. +% +% +++ +% +% First, we go to the [Twitter Developer Portal](https://developer.twitter.com/en/portal/dashboard) and sign up for an account if you do not have one yet. Note that you will need a valid phone number to associate with your developer account. After filling out the basic information, we will get *essential access* to the Twitter API. Then we can create an app and hit the "get key" button, and we will get the API key and API key secret of the app (along with the bearer token, which will not be used in this demonstration). **We need to store the key and secret in a safe place, and make sure not to show them to anyone else (also, do not accidentally push them to a GitHub repository).** If you lose the key, you can always regenerate it. Next, we go to the "Keys and tokens" tab of the app, and generate an access token and an access token secret. **Save the access token and the access token secret in a safe place as well.** Your app will look something like {numref}`fig:twitter-API-keys-tokens`.
+% +% +++ +% +% ```{figure} img/twitter-API-keys-tokens.png +% :name: fig:twitter-API-keys-tokens +% +% Generating the API key-secret pair and the access token-secret pair in Twitter API. +% ``` +% +% +++ +% +% Once you get the access keys and secrets, you can follow along with the examples that we show here. +% To get started, load the `tweepy` package and authenticate our access to the Twitter developer portal account. +% +% ```{code-cell} ipython3 +% :tags: [remove-output] +% +% import tweepy +% +% # replace these with the api key, api key secret, access token and access token secret +% # generated on your own +% api_key = "8OxHWiIWjy8M39LvnC8OfSXrj" +% api_key_secret = "scqjRqX5stoy4pYB5Zu52tCBKzhGLDh5nRqTEM6CMoLRkRLR8F" +% +% access_token = "1556029189484007425-mYwaDCI1WnCxjuMt0jb2UYD2ns8BYB" +% access_token_secret = "pDG4Ta7giYLY3mablPhd6y9bB5y2Aer1Cn18rihIJFBB7" +% +% # Authenticate to Twitter +% auth = tweepy.OAuthHandler(api_key, api_key_secret) +% auth.set_access_token(access_token, access_token_secret) +% +% api = tweepy.API(auth) +% +% try: +% api.verify_credentials() +% print("Successful Authentication") +% except: +% print("Failed authentication") +% ``` +% +% ``` +% Successful Authentication +% ``` +% +% +++ +% +% `tweepy` provides an extensive set of functions to search +% Twitter for tweets, users, their followers, and more. +% Let's construct a small data set of the last 200 tweets and +% retweets from the [@scikit_learn](https://twitter.com/scikit_learn) account. A few of the most recent tweets +% are shown in {numref}`fig:01-scikit-learn-twitter`. +% +% +++ +% +% ```{figure} img/scikit-learn-twitter.png +% :name: fig:01-scikit-learn-twitter +% +% The `scikit-learn` account Twitter feed. +% ``` +% +% +++ +% +% **Stop! Think about your API usage carefully!** +% +% When you access an API, you are initiating a transfer of data from a web server +% to your computer. Web servers are expensive to run and do not have infinite resources. 
+% If you try to ask for *too much data* at once, you can use up a huge amount of the server's bandwidth. +% If you try to ask for data *too frequently*—e.g., if you +% make many requests to the server in quick succession—you can also bog the server down and make +% it unable to talk to anyone else. Most servers have mechanisms to revoke your access if you are not +% careful, but you should try to prevent issues from happening in the first place by being extra careful +% with how you write and run your code. You should also keep in mind that when a website owner +% grants you API access, they also usually specify a limit (or *quota*) of how much data you can ask for. +% Be careful not to overrun your quota! In this example, we should take a look at +% [the Twitter website](https://developer.twitter.com/en/docs/twitter-api/rate-limits) to see what limits +% we should abide by when using the API. +% +% +++ +% +% **Using `tweepy`** +% +% After checking the Twitter website, it seems like asking for 200 tweets one time is acceptable. +% So we can use the `user_timeline` function to ask for the last 200 tweets from the [@scikit_learn](https://twitter.com/scikit_learn) account. +% +% ```{code-cell} ipython3 +% :tags: [remove-output] +% +% userID = "scikit_learn" +% +% scikit_learn_tweets = api.user_timeline( +% screen_name=userID, +% count=200, +% include_rts=True, +% tweet_mode="extended", +% ) +% ``` +% +% Let's take a look at the first 3 most recent tweets of [@scikit_learn](https://twitter.com/scikit_learn) through accessing the attributes of tweet data dictionary: +% +% ```{code-cell} ipython3 +% :tags: [remove-output] +% +% for info in scikit_learn_tweets[:3]: +% print("ID: {}".format(info.id)) +% print(info.created_at) +% print(info.full_text) +% print("\n") +% ``` +% +% ``` +% ID: 1555686128971403265 +% 2022-08-05 22:44:11+00:00 +% scikit-learn 1.1.2 is out on https://t.co/lSpi4eDc2t and conda-forge! 
+% +% This is a small maintenance release that fixes a couple of regressions: +% https://t.co/Oa84ES0qpG +% +% +% ID: 1549321048943988737 +% 2022-07-19 09:11:37+00:00 +% RT @MarenWestermann: @scikit_learn It is worth highlighting that this scikit-learn sprint is seeing the highest participation of women out… +% +% +% ID: 1548339716465930244 +% 2022-07-16 16:12:09+00:00 +% @StefanieMolin @theBodlina @RichardKlima We continue pulling requests here in Dublin. Putting some Made in Ireland code in the scikit-learn codebase 🇮🇪 . Current stats: 18 PRs opened, 12 merged 🚀 https://t.co/ccWy8vh8YI +% ``` +% +% +++ +% +% A full list of available attributes provided by the Twitter API can be found [here](https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet). +% +% +++ +% +% For demonstration purposes, let's only use a +% few variables of interest: `created_at`, `user.screen_name`, `retweeted`, +% and `full_text`, and construct a `pandas` DataFrame using the extracted information. +% +% ```{code-cell} ipython3 +% :tags: [remove-output] +% +% columns = ["time", "user", "is_retweet", "text"] +% data = [] +% for tweet in scikit_learn_tweets: +% data.append( +% [tweet.created_at, tweet.user.screen_name, tweet.retweeted, tweet.full_text] +% ) +% +% scikit_learn_tweets_df = pd.DataFrame(data, columns=columns) +% scikit_learn_tweets_df +% ``` +% +% ```{code-cell} ipython3 +% :tags: [remove-input] +% +% scikit_learn_tweets_df = pd.read_csv("data/reading_api_df.csv", index_col=0) +% scikit_learn_tweets_df +% ``` +% +% If you look back up at the image of the [@scikit_learn](https://twitter.com/scikit_learn) Twitter page, you will +% recognize the text of the most recent few tweets in the above data frame. In +% other words, we have successfully created a small data set using the Twitter +% API—neat!
This data is also quite different from what we obtained from web scraping; +% the extracted information can be easily converted into a `pandas` data frame (although not *every* API will provide data in such a nice format). +% From this point onward, the `scikit_learn_tweets_df` data frame is stored on your +% machine, and you can play with it to your heart's content. For example, you can use +% the data frame's `to_csv` method (`DataFrame.to_csv`) to save it to a file and `pandas.read_csv` to read it into Python again later; +% and after reading the next few chapters you will have the skills to +% compute the percentage of retweets versus tweets, find the most oft-retweeted +% account, make visualizations of the data, and much more! If you decide that you want +% to ask the Twitter API for more data +% (see [the `tweepy` page](https://github.com/tweepy/tweepy) +% for more examples of what is possible), just be mindful as usual about how much +% data you are requesting and how frequently you are making requests. +% +% +++ ## Exercises Practice exercises for the material covered in this chapter can be found in the accompanying -[worksheets repository](https://github.com/UBC-DSCI/data-science-a-first-intro-worksheets#readme) +[worksheets repository](https://github.com/UBC-DSCI/data-science-a-first-intro-python-worksheets#readme) in the "Reading in data locally and from the web" row. You can launch an interactive version of the worksheet in your browser by clicking the "launch binder" button. You can also preview a non-interactive version of the worksheet by clicking "view worksheet."