Help
Difference between revisions of "SPARQL (intermediate)"
Help:SPARQL 2 will explore federated queries fetching data from both LinguaguaLibre and Wikidata's endpoints. It allows to augent your data thank to Wikidata-provided languages population, status, countries, as well as speakers' geocoordinates, country of origin, etc.
Line 845: | Line 845: | ||
=== ✅ Speakers → Countries with most speakers === | === ✅ Speakers → Countries with most speakers === | ||
− | To run on LLQS.<ref name="LLQS" /> | + | To run on LLQS.<ref name="LLQS" /> Note: this queries collects ''declared'' speakers, and do not check for actual recordings. |
{| style="width:100%" | {| style="width:100%" | ||
|- style="vertical-align:top;" | |- style="vertical-align:top;" |
Revision as of 09:24, 27 February 2024
Tools
SPARQL to persitent data
Some SPARQL queries are meaningful but heavy and overly slow. This administrator tool stores or updates the response data on LinguaLibre, within a wikipage. Stored data can then be loaded in <0.1 second. Multiple data can also be merged via a common property if any.
Federated queries
- To query Lingualibre from Wikidata, use
SERVICE <https://lingualibre.org/sparql>
. - To query Wikidata from LinguaLibre, use
SERVICE <https://query.wikidata.org/sparql>
. - To query Commons from Lingualibre, use
SERVICE <https://commons-query.wikimedia.org/sparql>
.
Retrieve data of LinguaLibre from Wikidata
To run on WDQS.[1] It lists the existing levels in LinguaLibre.
#defaultEndpoint:Wikidata
PREFIX prop: <https://lingualibre.org/prop/direct/>
PREFIX entity: <https://lingualibre.org/entity/>
SELECT * {
SERVICE <https://lingualibre.org/sparql> {
SELECT
?item
?itemLabel
{
?item prop:P2 entity:Q5.
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
}
}
|
|
From LinguaLibre Query service, retrieve Wikidata's data
- To run on LLQS.[2]
- Federate SPARQL query example to create.
# [Example to complete]
Notable elements
LinguaLibre endpoint | Wikidata endpoint | |
---|---|---|
|
For languages:
|
For countries:
For places:
|
Languages
✅ Language () → List of WD languages with speaker population >80M
To run on WDQS.[1]
#defaultEndpoint:Wikidata
SELECT DISTINCT ?item ?ISO ?itemLabel WHERE {
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
?item p:P1098 ?statement0.
?item p:P219 ?isolang.
?statement0 (psv:P1098/wikibase:quantityAmount) ?numericQuantity.
FILTER(?numericQuantity > "80000000"^^xsd:decimal)
MINUS {
?item p:P31 ?statement1.
?statement1 (ps:P31/(wdt:P279*)) wd:Q25295.
}
OPTIONAL { ?item wdt:P218 ?ISO. }
}
ORDER BY ASC (?ISO)
LIMIT 100
|
|
Language () → List of LL languages with wd speaker population
- Query to create.
✅ Languages → List of Sign languages on wikidata
To run on WDQS.[1]
#defaultEndpoint:Wikidata
SELECT DISTINCT ?item ?itemLabel
WHERE {
?item p:P31 ?statement0.
?statement0 (ps:P31/(wdt:P279*)) wd:Q34228.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
}
|
|
✅ Languages → List of whistled languages on wikidata
To run on WDQS.[1]
#defaultEndpoint:Wikidata
SELECT ?whistled_language ?whistled_languageLabel
WHERE {
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
?whistled_language wdt:P31 wd:Q2737212. # Is whistled language (Q2737212)
}
|
|
✅ Languages → List of LL languages with wikidata dead or extinct status
To run on LLQS.[2]
#defaultEndpoint:Lingualibre
SELECT
?deadLanguageLinguaLibre
?deadLanguageLinguaLibreLabel
?audios
# List Wikidata dead/extinct languages
WITH {
SELECT DISTINCT ?deadLanguage {
SERVICE <https://query.wikidata.org/sparql> {
{ ?deadLanguage wdt:P31/wdt:P279* wd:Q45762. }
UNION
{ ?deadLanguage wdt:P31/wdt:P279* wd:Q38058796. }
}
}
} AS %deadLanguage
# Compare with LinguaLibre languages, keep when P12 `wikidata` matches
WITH {
SELECT ?deadLanguageLinguaLibre {
INCLUDE %deadLanguage.
BIND(REPLACE(STR(?deadLanguage), '.*/', '') AS ?deadLanguageQid)
?deadLanguageLinguaLibre
prop:P2 entity:Q4;
prop:P12 ?deadLanguageQid.
}
} AS %deadLanguageLinguaLibre
# For those Lingualibre languages, count audios into ?audios
WITH {
SELECT
?deadLanguageLinguaLibre
(COUNT(?audio) AS ?audios)
{
INCLUDE %deadLanguageLinguaLibre.
?audio
prop:P2 entity:Q2;
prop:P4 ?deadLanguageLinguaLibre.
}
GROUP BY ?deadLanguageLinguaLibre
} AS %audios
# What is that ? Back to main scope ? :
{
INCLUDE %deadLanguageLinguaLibre.
OPTIONAL{ INCLUDE %audios. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
ORDER BY DESC(?audios)
|
|
✅ Languages → List of LL languages with wikidata Endangered language status
To run on LLQS.[2]
#defaultEndpoint:Lingualibre
SELECT
?endangeredLanguageLinguaLibre
?endangeredLanguageLinguaLibreLabel
?audios
# List Wikidata endangered languages
WITH {
SELECT DISTINCT ?endangeredLanguage {
SERVICE <https://query.wikidata.org/sparql> {
{ ?endangeredLanguage wdt:P31/wdt:P279* wd:Q335214. }
UNION
{ ?endangeredLanguage wdt:P31/wdt:P279* wd:Q83365366. }
}
}
} AS %endangeredLanguage
# Compare with LinguaLibre languages, keep when P12 `wikidata` matches
WITH {
SELECT ?endangeredLanguageLinguaLibre {
INCLUDE %endangeredLanguage.
BIND(REPLACE(STR(?endangeredLanguage), '.*/', '') AS ?endangeredLanguageQid)
?endangeredLanguageLinguaLibre
prop:P2 entity:Q4;
prop:P12 ?endangeredLanguageQid.
}
} AS %endangeredLanguageLinguaLibre
# For those Lingualibre languages, count audios into ?audios
WITH {
SELECT
?endangeredLanguageLinguaLibre
(COUNT(?audio) AS ?audios)
{
INCLUDE %endangeredLanguageLinguaLibre.
?audio
prop:P2 entity:Q2;
prop:P4 ?endangeredLanguageLinguaLibre.
}
GROUP BY ?endangeredLanguageLinguaLibre
} AS %audios
# What is that ? Back to main scope ? :
{
INCLUDE %endangeredLanguageLinguaLibre.
OPTIONAL{ INCLUDE %audios. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
ORDER BY DESC(?audios)
|
|
✅ Languages → List of LL languages with wikidata Sign language status
To run on LLQS.[2]
#defaultEndpoint:Lingualibre
SELECT
?signLanguageLinguaLibre
?signLanguageLinguaLibreLabel
?audios
# List Wikidata sign languages
WITH {
SELECT DISTINCT ?signLanguage {
SERVICE <https://query.wikidata.org/sparql> {
{ ?signLanguage wdt:P31/wdt:P279* wd:Q34228. }
}
}
} AS %signLanguage
# Compare with LinguaLibre languages, keep when P12 `wikidata` matches
WITH {
SELECT ?signLanguageLinguaLibre {
INCLUDE %signLanguage.
BIND(REPLACE(STR(?signLanguage), '.*/', '') AS ?signLanguageQid)
?signLanguageLinguaLibre
prop:P2 entity:Q4;
prop:P12 ?signLanguageQid.
}
} AS %signLanguageLinguaLibre
# For those Lingualibre languages, count audios into ?audios
WITH {
SELECT
?signLanguageLinguaLibre
(COUNT(?audio) AS ?audios)
{
INCLUDE %signLanguageLinguaLibre.
?audio
prop:P2 entity:Q2;
prop:P4 ?signLanguageLinguaLibre.
}
GROUP BY ?signLanguageLinguaLibre
} AS %audios
# What is that ? Back to main scope ? :
{
INCLUDE %signLanguageLinguaLibre.
OPTIONAL{ INCLUDE %audios. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
ORDER BY DESC(?audios)
|
|
Language (Marathi (Q34)) → Item with Wikidata Qid(s), optional Geo-coordinates
To run on LLQS.[2]
#defaultEndpoint:Lingualibre
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX ll: <https://lingualibre.org/entity/>
PREFIX llt: <https://lingualibre.org/prop/direct/>
PREFIX lltn: <https://lingualibre.org/prop/direct-normalized/>
select distinct ?record ?transcription ?languageLabel ?wdQid (?wdLabel as ?wdLabelEN) ?coord
where {
?record llt:P2 ll:Q2 . # Filter: P2 'instance of' is Q2 'record'
?record llt:P4 ll:Q34 . # Filter: record's P4 'language' is Q34 'Marathi'
?record llt:P4 ?language . # Assign value: record's P4 'language' to variable ?language
?record llt:P7 ?transcription . # Assign value: record's P7 'transcription' to variable ?transcription
?record lltn:P12 ?wdQid . # Assign value: record's P12 'wikidata id' to variable ?wdQid
SERVICE <https://query.wikidata.org/sparql> {
OPTIONAL { ?wdQid wdt:P625 ?coord . } # Assign value: wikidata item's wd:P625 'coordinates' to variable ?coord
OPTIONAL {
?wdQid rdfs:label ?wdLabel . # Assign value: wikidata item's label to variable ?wikidataLabel
FILTER (LANG(?wdLabel) = "en") . # Filter: default language, else English
}
}
SERVICE wikibase:label {
bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
}
}
|
|
✅ Languages → Languages with gender and recordings counts
To run on LLQS.[2]
#defaultEndpoint:Lingualibre
SELECT ?languageLabel ?wikidata ?iso
?malesSpeakers ?malesRecords ?femalesSpeakers ?femalesRecords
(ROUND(1000*?femalesRecords/(?femalesRecords+?malesRecords))/10 AS ?percent)
WITH {
SELECT ?language ?languageLabel ?wikidata ?iso {
?record prop:P2 entity:Q2 . # Filter: P2 'instance of' is Q2 'record'
?record prop:P4 ?language . # Assign value: P4 'language' into ?language
?language prop:P12 ?wikidata . # Assign value: P12 'wikidata id' into ?wikidata
OPTIONAL { ?language prop:P13 ?iso . } # Assign value: P13 'iso639-3' into ?iso
}
GROUP BY ?language ?languageLabel ?wikidata ?iso
} AS %base
WITH {
SELECT ?language ?languageLabel ?iso ?genderLabel
(COUNT(DISTINCT ?females) AS ?femalesSpeakers)
(COUNT(DISTINCT ?record) AS ?femalesRecords) {
INCLUDE %base
?record prop:P4 ?language ; # Filter
prop:P5 ?females . # Assign value: P5 'speaker' into ?females
?females prop:P8 entity:Q17 ; # Filter
prop:P8 ?gender . # Assign value: P8 'gender' into ?gender
} GROUP BY ?language ?languageLabel ?iso ?genderLabel
} AS %females
WITH {
SELECT ?language ?languageLabel ?iso ?genderLabel
(COUNT(DISTINCT ?males) AS ?malesSpeakers)
(COUNT(DISTINCT ?record) AS ?malesRecords) {
INCLUDE %base
?record prop:P4 ?language ;
prop:P5 ?males . # Assign value: P5 'speaker' into variable ?speakerQid
?males prop:P8 entity:Q16 ;
prop:P8 ?gender .
} GROUP BY ?language ?languageLabel ?iso ?genderLabel
} AS %males
{
INCLUDE %base
INCLUDE %females
INCLUDE %males
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
GROUP BY ?languageLabel ?wikidata ?iso ?malesSpeakers ?malesRecords ?femalesSpeakers ?femalesRecords
ORDER BY ASC(?languageLabel )
|
|
[HEAVY] Languages → Languages with gender and recordings counts (2)
- Section needs review.
To run on LLQS.[2]
#defaultEndpoint:Lingualibre
SELECT ?iso
(?genderLabel as ?Gender)
(COUNT(DISTINCT ?speakerQid) as ?Speakers)
(COUNT(DISTINCT ?record) as ?Records)
WHERE {
?record prop:P2 entity:Q2 . # Filter: items where P2 'instance of' is Q2 'record'
?record prop:P4 ?language . # Filter: items where P4 'language' is Q34 'Marathi'
# OPTIONAL { ?language prop:P12 ?wikidata } # Assign value: P12 'wikidata id' into variable ?WD
OPTIONAL { ?language prop:P13 ?iso } # Assign value: P13 'iso639-3' into ?isoCode
?record prop:P5 ?speakerQid . # Assign value: P5 'speaker' into variable ?speakerQid
?speakerQid prop:P8 ?gender . # Assign value: P8 'sex or gender'
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
}
GROUP BY ?iso ?genderLabel
ORDER BY ASC(?iso )
|
|
Speakers
✅ Speakers → Largest number of languages recorded and known
To run on LLQS.[2]
#defaultEndpoint:Lingualibre
#Title: Speakers with recordings largest number of languages and known languages
SELECT ?speaker ?speakerLabel ?count ?languages
# Get audios, language, speaker triplet
WITH {
SELECT DISTINCT ?speaker ?language {
?audio prop:P4 ?language;
prop:P5 ?speaker.
}
} AS %speakers
# Get the count of languages per each speaker
WITH {
SELECT ?speaker (COUNT(?speaker) AS ?count) {
INCLUDE %speakers.
}
GROUP BY ?speaker
ORDER BY DESC(?count)
} AS %countOfLanguagesRecordedPerSpeaker
# Get the maximum number of languages per each speaker
WITH {
SELECT (MAX(?count) AS ?maxNumberOfLanguagesRecorded) {
INCLUDE %countOfLanguagesRecordedPerSpeaker.
}
} AS %maxNumberOfLanguagesRecorded
# Get those speakers whose count equals the maximum number of languages
WITH {
SELECT ?speaker ?count {
INCLUDE %countOfLanguagesRecordedPerSpeaker.
INCLUDE %maxNumberOfLanguagesRecorded.
FILTER(?count = ?maxNumberOfLanguagesRecorded).
}
} AS %speakersWithMostNumberOfLanguagesRecorded
# Get the languages of those speakers that have recorded audios in the
# most number of languages
WITH {
SELECT ?speaker (GROUP_CONCAT(?languageLabel; SEPARATOR = ", ") AS ?languages) {
INCLUDE %speakersWithMostNumberOfLanguagesRecorded.
?speaker prop:P4 [
rdfs:label ?languageLabel
]
FILTER(LANG(?languageLabel) = "en").
}
GROUP BY ?speaker
} AS %languagesOfSpeakersWithMostNumberOfLanguagesRecorded
{
INCLUDE %speakersWithMostNumberOfLanguagesRecorded.
INCLUDE %languagesOfSpeakersWithMostNumberOfLanguagesRecorded.
?speaker rdfs:label ?speakerLabel.
FILTER(LANG(?speakerLabel) = "en")
}
|
|
✅ Speakers → Countries with most speakers
To run on LLQS.[2] Note: this queries collects declared speakers, and do not check for actual recordings.
#defaultEndpoint:Lingualibre
SELECT ?country ?continentLabel ?ISO3 ?countryLabel (COUNT(?country) AS ?count)
WITH {
SELECT DISTINCT ?speaker {
?speaker prop:P2 entity:Q3;
}
} AS %speakers
WITH {
SELECT DISTINCT
?speaker
?country
?countryLabel
?ISO3
?continentLabel
{
INCLUDE %speakers.
?speaker prop:P14 ?residence.
# Avoids weird errors.
FILTER(REGEX(?residence, "^Q[0-9]+$"))
BIND(IRI(CONCAT('http://www.wikidata.org/entity/', ?residence)) AS ?residenceId)
# Get country from wikidata
SERVICE <https://query.wikidata.org/sparql> {
?residenceId wdt:P17 ?country.
?country rdfs:label ?countryLabel;
wdt:P298 ?ISO3;
wdt:P30 ?continent.
?continent rdfs:label ?continentLabel.
FILTER(LANG(?countryLabel) = "en").
FILTER(LANG(?continentLabel) = "en").
}
}
} AS %speakersWithCountries
{
INCLUDE %speakersWithCountries.
}
GROUP BY ?country ?continentLabel ?ISO3 ?countryLabel
ORDER BY DESC(?count)
|
|
Speakers → Map of speakers by place
To run on WDQS.[1]
#defaultEndpoint:Wikidata
#defaultView:Map
PREFIX ll: <https://lingualibre.org/entity/>
PREFIX llt: <https://lingualibre.org/prop/direct/>
SELECT DISTINCT ?lLabel ?coord WITH {
SELECT ?lLabel ?loc WHERE {
SERVICE <https://lingualibre.org/sparql> {
select DISTINCT ?lLabel ?loc {
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
?l llt:P2 ll:Q3 ;
llt:P14 ?loc .
?record llt:P5 ?l.
FILTER (regex(?loc, '^Q'))
}
}
}
} AS %i
WHERE {
INCLUDE %i
BIND (URI(CONCAT("http://www.wikidata.org/entity/", ?loc)) AS ?locURL)
SERVICE <https://query.wikidata.org/sparql> {
select * {
?locURL wdt:P625 ?coord .
}
}
}
|
|
Speakers → Map of speakers with recordings
To run on WDQS.[1]
#defaultEndpoint:Wikidata
#defaultView:Map
# Same can apply to languages :
# Q3→Q4 : items speaker and language
# P14→P12 : items WD location and WD language
# P5→P4 : properties speaker and languages
# P625→P1098 : properties location and population
PREFIX ll: <https://lingualibre.org/entity/>
PREFIX llt: <https://lingualibre.org/prop/direct/>
SELECT DISTINCT ?itemLabel ?wikidata (?info AS ?coordinates) ?records (SAMPLE(?tag) AS ?layer)
# On Lingualibre, get item's info
WITH {
SELECT *
WHERE {
SERVICE <https://lingualibre.org/sparql> { # Commented on LLQS only
SELECT DISTINCT ?itemLabel ?wikidata (COUNT(?record) AS ?records)
{
?item llt:P2 ll:Q3 ;
rdfs:label ?itemLabel;
llt:P14 ?wikidata .
?record llt:P5 ?item .
FILTER (LANG(?itemLabel)='en')
FILTER (regex(?wikidata, '^Q'))
} GROUP BY ?itemLabel ?wikidata
} # Commented on LLQS only
}
} AS %infoWikidataId
# On Wikidata, get the target info
WHERE {
INCLUDE %infoWikidataId
BIND (URI(CONCAT("http://www.wikidata.org/entity/", ?wikidata)) AS ?infoURL)
SERVICE <https://query.wikidata.org/sparql> {
SELECT * {
?infoURL wdt:P625 ?info .
}
}
BIND(
IF(?records < 10, "<10",
IF(?records < 1000, "10-1,000",
IF(?records < 5000, "1k-5k",
IF(?records < 25000, "5k-25k",
IF(?records < 50000, "25k-50k",
">50k")))))
AS ?tag) .
}
GROUP BY ?itemLabel ?wikidata ?info ?records
ORDER BY DESC (?records)
|
|
References
- ↑ 1.0 1.1 1.2 1.3 1.4 1.5 Wikidata Query Service (WDQS) – run SPARQL Queries upon Wikidata. Run, test, download the data as json, csv or tsv. Has advanced user-friendly features such as : word hovering too see a term's meaning, code optimization, etc.
- ↑ 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 LinguaLibre Query Service (LLQS) – run SPARQL Queries upon LinguaLibre. Run, test, download the data as json, csv or tsv.