Help
SPARQL (intermediate)
Revision as of 10:51, 16 November 2022 by Yug (talk | contribs) (→✅ Language () → List of WD languages with speaker population >80M)
Help:SPARQL 2 will explore federated queries fetching data from both LinguaguaLibre and Wikidata's endpoints, then Wikidata Lexemes, an emerging source of lexicographic data. The duo can be a solid combo to provide lexicographic and multimedia (audio recordings and images) for either Wikimedia modules or web developers.
Revision as of 10:51, 16 November 2022 by Yug (talk | contribs) (→✅ Language () → List of WD languages with speaker population >80M)
Tools
Lexemes Queries Generator
SPARQL to persitent data
Some SPARQL queries are meaningful but heavy and overly slow. This administrator tool stores or updates the response data on LinguaLibre, within a wikipage. Stored data can then be loaded in <0.1 second. Multiple data can also be merged via a common property if any.
Federated queries
- To query Lingualibre from Wikidata, use
SERVICE <https://lingualibre.org/sparql>
. - To query Wikidata from LinguaLibre, use
SERVICE <https://query.wikidata.org/sparql>
.
Retrieve data of LinguaLibre from Wikidata
To run on WDQS.[1] It lists the existing levels in LinguaLibre.
PREFIX prop: <https://lingualibre.org/prop/direct/>
PREFIX entity: <https://lingualibre.org/entity/>
SELECT * {
SERVICE <https://lingualibre.org/sparql> {
SELECT
?item
?itemLabel
{
?item prop:P2 entity:Q5.
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
}
}
Retrieve data of Wikidata from LinguaLibre
To run on LLQS.[2]
# [Example to complete]
Notable elements
LinguaLibre endpoint | Wikidata endpoint | |
---|---|---|
For languages:
|
For countries:
For places:
|
Languages
✅ Language () → List of WD languages with speaker population >80M
To run on WDQS.[1]
SELECT DISTINCT ?item ?ISO ?itemLabel WHERE {
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
?item p:P1098 ?statement0.
?item p:P219 ?isolang.
?statement0 (psv:P1098/wikibase:quantityAmount) ?numericQuantity.
FILTER(?numericQuantity > "80000000"^^xsd:decimal)
MINUS {
?item p:P31 ?statement1.
?statement1 (ps:P31/(wdt:P279*)) wd:Q25295.
}
OPTIONAL { ?item wdt:P218 ?ISO. }
}
ORDER BY ASC (?ISO)
LIMIT 100
|
|
✅ Language () → List of LL languages with wd speaker population
✅ Languages → List of Sign languages on wikidata
To run on WDQS.[1]
SELECT DISTINCT ?item ?itemLabel
WHERE {
?item p:P31 ?statement0.
?statement0 (ps:P31/(wdt:P279*)) wd:Q34228.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
}
|
|
✅ Languages → List of LL languages with wikidata dead or extinct status
To run on LLQS.[2]
SELECT
?deadLanguageLinguaLibre
?deadLanguageLinguaLibreLabel
?audios
# List Wikidata dead/extinct languages
WITH {
SELECT DISTINCT ?deadLanguage {
SERVICE <https://query.wikidata.org/sparql> {
{ ?deadLanguage wdt:P31/wdt:P279* wd:Q45762. }
UNION
{ ?deadLanguage wdt:P31/wdt:P279* wd:Q38058796. }
}
}
} AS %deadLanguage
# Compare with LinguaLibre languages, keep when P12 `wikidata` matches
WITH {
SELECT ?deadLanguageLinguaLibre {
INCLUDE %deadLanguage.
BIND(REPLACE(STR(?deadLanguage), '.*/', '') AS ?deadLanguageQid)
?deadLanguageLinguaLibre
prop:P2 entity:Q4;
prop:P12 ?deadLanguageQid.
}
} AS %deadLanguageLinguaLibre
# For those Lingualibre languages, count audios into ?audios
WITH {
SELECT
?deadLanguageLinguaLibre
(COUNT(?audio) AS ?audios)
{
INCLUDE %deadLanguageLinguaLibre.
?audio
prop:P2 entity:Q2;
prop:P4 ?deadLanguageLinguaLibre.
}
GROUP BY ?deadLanguageLinguaLibre
} AS %audios
# What is that ? Back to main scope ? :
{
INCLUDE %deadLanguageLinguaLibre.
OPTIONAL{ INCLUDE %audios. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
ORDER BY DESC(?audios)
|
|
✅ Languages → List of LL languages with wikidata Endangered language status
To run on LLQS.[2]
SELECT
?endangeredLanguageLinguaLibre
?endangeredLanguageLinguaLibreLabel
?audios
# List Wikidata endangered languages
WITH {
SELECT DISTINCT ?endangeredLanguage {
SERVICE <https://query.wikidata.org/sparql> {
{ ?endangeredLanguage wdt:P31/wdt:P279* wd:Q335214. }
UNION
{ ?endangeredLanguage wdt:P31/wdt:P279* wd:Q83365366. }
}
}
} AS %endangeredLanguage
# Compare with LinguaLibre languages, keep when P12 `wikidata` matches
WITH {
SELECT ?endangeredLanguageLinguaLibre {
INCLUDE %endangeredLanguage.
BIND(REPLACE(STR(?endangeredLanguage), '.*/', '') AS ?endangeredLanguageQid)
?endangeredLanguageLinguaLibre
prop:P2 entity:Q4;
prop:P12 ?endangeredLanguageQid.
}
} AS %endangeredLanguageLinguaLibre
# For those Lingualibre languages, count audios into ?audios
WITH {
SELECT
?endangeredLanguageLinguaLibre
(COUNT(?audio) AS ?audios)
{
INCLUDE %endangeredLanguageLinguaLibre.
?audio
prop:P2 entity:Q2;
prop:P4 ?endangeredLanguageLinguaLibre.
}
GROUP BY ?endangeredLanguageLinguaLibre
} AS %audios
# What is that ? Back to main scope ? :
{
INCLUDE %endangeredLanguageLinguaLibre.
OPTIONAL{ INCLUDE %audios. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
ORDER BY DESC(?audios)
|
|
✅ Languages → List of LL languages with wikidata Sign language status
To run on LLQS.[2]
SELECT
?signLanguageLinguaLibre
?signLanguageLinguaLibreLabel
?audios
# List Wikidata sign languages
WITH {
SELECT DISTINCT ?signLanguage {
SERVICE <https://query.wikidata.org/sparql> {
{ ?signLanguage wdt:P31/wdt:P279* wd:Q34228. }
}
}
} AS %signLanguage
# Compare with LinguaLibre languages, keep when P12 `wikidata` matches
WITH {
SELECT ?signLanguageLinguaLibre {
INCLUDE %signLanguage.
BIND(REPLACE(STR(?signLanguage), '.*/', '') AS ?signLanguageQid)
?signLanguageLinguaLibre
prop:P2 entity:Q4;
prop:P12 ?signLanguageQid.
}
} AS %signLanguageLinguaLibre
# For those Lingualibre languages, count audios into ?audios
WITH {
SELECT
?signLanguageLinguaLibre
(COUNT(?audio) AS ?audios)
{
INCLUDE %signLanguageLinguaLibre.
?audio
prop:P2 entity:Q2;
prop:P4 ?signLanguageLinguaLibre.
}
GROUP BY ?signLanguageLinguaLibre
} AS %audios
# What is that ? Back to main scope ? :
{
INCLUDE %signLanguageLinguaLibre.
OPTIONAL{ INCLUDE %audios. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
ORDER BY DESC(?audios)
|
|
❌ Language (Q34) → Wikidata Qid(s) → Geo-coordinates
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX ll: <https://lingualibre.org/entity/>
PREFIX llt: <https://lingualibre.org/prop/direct/>
PREFIX lltn: <https://lingualibre.org/prop/direct-normalized/>
select distinct ?record ?transcription ?languageLabel ?wdQid ?wdQidLabel ?wdLabel ?coord
where {
?record llt:P2 ll:Q2 . # Filter: P2 'instance of' is Q2 'record'
?record llt:P4 ll:Q34 . # Filter: record's P4 'language' is Q34 'Marathi'
?record llt:P4 ?language . # Assign value: record's P4 'language' to variable ?language
?record llt:P7 ?transcription . # Assign value: record's P7 'transcription' to variable ?transcription
?record lltn:P12 ?wdQid . # Assign value: record's P12 'wikidata id' to variable ?wikidataItem
SERVICE <https://query.wikidata.org/sparql> {
OPTIONAL { ?wdQid wdt:P625 ?coord . } # Assign value: wikidata item's wd:P625 'coordinates' to variable ?coord
OPTIONAL {
?wdQid rdfs:label ?wdLabel . # Assign value: wikidata item's label to variable ?wikidataLabel
FILTER (LANG(?wdLabel) = "en") . # Filter: default language, else English
}
}
SERVICE wikibase:label {
bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
}
}
|
Result type:record transcription languageLabel wdQid wdQidLabel wdLabel coord Q196212 Tathavade Marathi Q2719024 Q2719024 Tathavade Point(73.74 18.62) Q428904 Jambavade Marathi Q24894740 Q24894740 Jambavade Point(73.85 18.51) Q428900 Dhangavhan Marathi Q24885008 Q24885008 Dhangavhan Point(73.85 18.52) … |
✅ Languages → Languages with gender and recordings counts
To run on LLQS.[2]
SELECT ?languageLabel ?wikidata ?iso
?malesSpeakers ?malesRecords ?femalesSpeakers ?femalesRecords
(ROUND(1000*?femalesRecords/(?femalesRecords+?malesRecords))/10 AS ?percent)
WITH {
SELECT ?language ?languageLabel ?wikidata ?iso {
?record prop:P2 entity:Q2 . # Filter: P2 'instance of' is Q2 'record'
?record prop:P4 ?language . # Assign value: P4 'language' into ?language
?language prop:P12 ?wikidata . # Assign value: P12 'wikidata id' into ?wikidata
OPTIONAL { ?language prop:P13 ?iso . } # Assign value: P13 'iso639-3' into ?iso
}
GROUP BY ?language ?languageLabel ?wikidata ?iso
} AS %base
WITH {
SELECT ?language ?languageLabel ?iso ?genderLabel
(COUNT(DISTINCT ?females) AS ?femalesSpeakers)
(COUNT(DISTINCT ?record) AS ?femalesRecords) {
INCLUDE %base
?record prop:P4 ?language ; # Filter
prop:P5 ?females . # Assign value: P5 'speaker' into ?females
?females prop:P8 entity:Q17 ; # Filter
prop:P8 ?gender . # Assign value: P8 'gender' into ?gender
} GROUP BY ?language ?languageLabel ?iso ?genderLabel
} AS %females
WITH {
SELECT ?language ?languageLabel ?iso ?genderLabel
(COUNT(DISTINCT ?males) AS ?malesSpeakers)
(COUNT(DISTINCT ?record) AS ?malesRecords) {
INCLUDE %base
?record prop:P4 ?language ;
prop:P5 ?males . # Assign value: P5 'speaker' into variable ?speakerQid
?males prop:P8 entity:Q16 ;
prop:P8 ?gender .
} GROUP BY ?language ?languageLabel ?iso ?genderLabel
} AS %males
{
INCLUDE %base
INCLUDE %females
INCLUDE %males
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
GROUP BY ?languageLabel ?wikidata ?iso ?malesSpeakers ?malesRecords ?femalesSpeakers ?femalesRecords
ORDER BY ASC(?languageLabel )
|
|
[HEAVY] Languages → Languages with gender and recordings counts (2)
To run on LLQS.[2]
SELECT (?languageLabel as ?Language) ?iso
(?genderLabel as ?Gender)
(COUNT(DISTINCT ?speakerQid) as ?Speakers)
(COUNT(DISTINCT ?record) as ?Records)
WHERE {
?record prop:P2 entity:Q2 . # Filter: items where P2 'instance of' is Q2 'record'
?record prop:P4 ?language . # Filter: items where P4 'language' is Q34 'Marathi'
# OPTIONAL { ?language prop:P12 ?wikidata } # Assign value: P12 'wikidata id' into variable ?WD
OPTIONAL { ?language prop:P13 ?iso } # Assign value: P13 'iso639-3' into ?isoCode
?record prop:P5 ?speakerQid . # Assign value: P5 'speaker' into variable ?speakerQid
?speakerQid prop:P8 ?gender . # Assign value: P8 'sex or gender'
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
}
GROUP BY ?languageLabel ?iso ?genderLabel
ORDER BY ASC(?languageLabel )
|
Language iso Gender Speakers Records
Acehnese ace male 1 235
Afrikaans afr male 3 2943
Alemannic gsw male 1 2
Algerian Ar. arq male 2 234
Algerian Ar. arq female 2 5
Amharic amh male 1 8
Amharic amh female 1 209
Anc. Greek grc male 1 15
Arabic ara male 12 5627
Arabic ara female 2 9
|
Lexemes
Notable elements
Part of speech
To run on WDQS.[1]
SELECT ?item ?itemLabel
WHERE {
?item wdt:P31 wd:Q82042 # Q82042 'part of speech'
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
# GROUP BY ?item
ORDER BY ASC(?itemLabel)
|
✅ Language (d:Q12107) → List of Wikidata lexemes for Breton
To run on WDQS.[1] Source: User:VIGNERON/common.js.
SELECT DISTINCT ?lexemeLabel ?lexeme
WITH {
SELECT ?lexeme ?lexemeLabel ?lexical_category WHERE {
?lexeme a ontolex:LexicalEntry ;
dct:language wd:Q12107 ;
wikibase:lemma ?lexemeLabel .
OPTIONAL {
?lexeme wikibase:lexicalCategory ?lexical_category .
}
}
} AS %results
WHERE {
INCLUDE %results
OPTIONAL {
?lexical_category rdfs:label ?lexical_categoryLabel .
FILTER (LANG(?lexical_categoryLabel) = "en")
}
}
|
|
SELECT ?lexeme ?lemma
WHERE {
?lexeme dct:language wd:Q12107;
wikibase:lemma ?lemma.
}
|
✅ Language () → List of wd lexemes with LL audio
To run on WDQS.[1]
SELECT * WHERE {
?l ontolex:lexicalForm ?x .
?x wdt:P443 ?value .
FILTER regex (str(?value), "^http://commons.wikimedia.org/wiki/Special:FilePath/LL-").
}
|
✅ Language () → List of wd lexemes with (LL audio and) wd translation (d:Q150)
✅ Concept (Q3142) → All lexeme whom sense is this concept
To run on WDQS.[1]
#title: Lexemes with senses linked to the item about the colour "red" (Q314 SELECT ?wikidataLexeme ?languageLabel ?lemma ?sense WHERE { ?wikidataLexeme dct:language ?language ; wikibase:lemma ?lemma ; ontolex:sense ?sense. ?language rdfs:label ?languageLabel . Filter(lang(?languageLabel)="en"). ?sense wdt:P5137 wd:Q3142 . # Filter : P5137 'item for this sense' is Q3142 'red' }
Speakers
✅ Speakers → Largest number of languages recorded and known
#Title: Speakers with recordings largest number of languages and known languages
SELECT ?speaker ?speakerLabel ?count ?languages
# Get audios, language, speaker triplet
WITH {
SELECT DISTINCT ?speaker ?language {
?audio prop:P4 ?language;
prop:P5 ?speaker.
}
} AS %speakers
# Get the count of languages per each speaker
WITH {
SELECT ?speaker (COUNT(?speaker) AS ?count) {
INCLUDE %speakers.
}
GROUP BY ?speaker
ORDER BY DESC(?count)
} AS %countOfLanguagesRecordedPerSpeaker
# Get the maximum number of languages per each speaker
WITH {
SELECT (MAX(?count) AS ?maxNumberOfLanguagesRecorded) {
INCLUDE %countOfLanguagesRecordedPerSpeaker.
}
} AS %maxNumberOfLanguagesRecorded
# Get those speakers whose count equals the maximum number of languages
WITH {
SELECT ?speaker ?count {
INCLUDE %countOfLanguagesRecordedPerSpeaker.
INCLUDE %maxNumberOfLanguagesRecorded.
FILTER(?count = ?maxNumberOfLanguagesRecorded).
}
} AS %speakersWithMostNumberOfLanguagesRecorded
# Get the languages of those speakers that have recorded audios in the
# most number of languages
WITH {
SELECT ?speaker (GROUP_CONCAT(?languageLabel; SEPARATOR = ", ") AS ?languages) {
INCLUDE %speakersWithMostNumberOfLanguagesRecorded.
?speaker prop:P4 [
rdfs:label ?languageLabel
]
FILTER(LANG(?languageLabel) = "en").
}
GROUP BY ?speaker
} AS %languagesOfSpeakersWithMostNumberOfLanguagesRecorded
{
INCLUDE %speakersWithMostNumberOfLanguagesRecorded.
INCLUDE %languagesOfSpeakersWithMostNumberOfLanguagesRecorded.
?speaker rdfs:label ?speakerLabel.
FILTER(LANG(?speakerLabel) = "en")
}
|
|
✅ Speakers → Countries with most speakers
SELECT ?country ?continentLabel ?ISO3 ?countryLabel (COUNT(?country) AS ?count)
WITH {
SELECT DISTINCT ?speaker {
?speaker prop:P2 entity:Q3;
}
} AS %speakers
WITH {
SELECT DISTINCT
?speaker
?country
?countryLabel
?ISO3
?continentLabel
{
INCLUDE %speakers.
?speaker prop:P14 ?residence.
# Avoids weird errors.
FILTER(REGEX(?residence, "^Q[0-9]+$"))
BIND(IRI(CONCAT('http://www.wikidata.org/entity/', ?residence)) AS ?residenceId)
# Get country from wikidata
SERVICE <https://query.wikidata.org/sparql> {
?residenceId wdt:P17 ?country.
?country rdfs:label ?countryLabel;
wdt:P298 ?ISO3;
wdt:P30 ?continent.
?continent rdfs:label ?continentLabel.
FILTER(LANG(?countryLabel) = "en").
FILTER(LANG(?continentLabel) = "en").
}
}
} AS %speakersWithCountries
{
INCLUDE %speakersWithCountries.
}
GROUP BY ?country ?continentLabel ?ISO3 ?countryLabel
ORDER BY DESC(?count)
|
|
Speakers → Map of speakers by place
To run on WDQS.[1]
#defaultView:Map
PREFIX ll: <https://lingualibre.org/entity/>
PREFIX llt: <https://lingualibre.org/prop/direct/>
SELECT DISTINCT ?lLabel ?coord WITH {
SELECT ?lLabel ?loc WHERE {
SERVICE <https://lingualibre.org/sparql> {
select DISTINCT ?lLabel ?loc {
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
?l llt:P2 ll:Q3 ;
llt:P14 ?loc .
?record llt:P5 ?l.
FILTER (regex(?loc, '^Q'))
}
}
}
} AS %i
WHERE {
INCLUDE %i
BIND (URI(CONCAT("http://www.wikidata.org/entity/", ?loc)) AS ?locURL)
SERVICE <https://query.wikidata.org/sparql> {
select * {
?locURL wdt:P625 ?coord .
}
}
}
|
|
Speakers → Map of speakers with recordings
To run on WDQS.[1]
#defaultView:Map
# Same can apply to languages :
# Q3→Q4 : items speaker and language
# P14→P12 : items WD location and WD language
# P5→P4 : properties speaker and languages
# P625→P1098 : properties location and population
PREFIX ll: <https://lingualibre.org/entity/>
PREFIX llt: <https://lingualibre.org/prop/direct/>
SELECT DISTINCT ?itemLabel ?wikidata (?info AS ?coordinates) ?records (SAMPLE(?tag) AS ?layer)
# On Lingualibre, get item's info
WITH {
SELECT *
WHERE {
SERVICE <https://lingualibre.org/sparql> { # Commented on LLQS only
SELECT DISTINCT ?itemLabel ?wikidata (COUNT(?record) AS ?records)
{
?item llt:P2 ll:Q3 ;
rdfs:label ?itemLabel;
llt:P14 ?wikidata .
?record llt:P5 ?item .
FILTER (LANG(?itemLabel)='en')
FILTER (regex(?wikidata, '^Q'))
} GROUP BY ?itemLabel ?wikidata
} # Commented on LLQS only
}
} AS %infoWikidataId
# On Wikidata, get the target info
WHERE {
INCLUDE %infoWikidataId
BIND (URI(CONCAT("http://www.wikidata.org/entity/", ?wikidata)) AS ?infoURL)
SERVICE <https://query.wikidata.org/sparql> {
SELECT * {
?infoURL wdt:P625 ?info .
}
}
BIND(
IF(?records < 10, "<10",
IF(?records < 1000, "10-1,000",
IF(?records < 5000, "1k-5k",
IF(?records < 25000, "5k-25k",
IF(?records < 50000, "25k-50k",
">50k")))))
AS ?tag) .
}
GROUP BY ?itemLabel ?wikidata ?info ?records
ORDER BY DESC (?records)
|
itemLabel wikidata coord records layer
नंदिनी रानडे Q1538 Point(73.85 18.51) 991 10-1,000
Luilui6666 Q8646 Point(114.15 22.27) 6198 5k-25k |
References
- ↑ 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 Wikidata Query Service (WDQS) – run SPARQL Queries upon Wikidata. Run, test, download the data as json, csv or tsv. Has advanced user-friendly features such as : word hovering too see a term's meaning, code optimization, etc.
- ↑ 2.0 2.1 2.2 2.3 2.4 2.5 LinguaLibre Query Service (LLQS) – run SPARQL Queries upon LinguaLibre. Run, test, download the data as json, csv or tsv.