Help

Difference between revisions of "SPARQL for maintenance"

SPARQL for maintenance gather some SPARQL queries which help to monitors errors and bring fix to our resources.

Line 71: Line 71:
 
#ORDER BY DESC (?has_fields_separator_in_username)
 
#ORDER BY DESC (?has_fields_separator_in_username)
 
GROUP BY (?has_fields_separator_in_username)
 
GROUP BY (?has_fields_separator_in_username)
 +
</query>
 +
|}
 +
 +
 +
=== ✅ Recordings → Gascon `Q930` recordings containing reserved sign `<code>-</code>` ===
 +
:''See also [[:phabricator:T297635|T297635]] : Lingualibre filename separator should be mass migrated from <code>-</code> to <code>—</code>, without breaking usernames nor words.''
 +
{| style="width:100%"
 +
|- style="vertical-align:top;"
 +
|style="padding: 0 3em;width:60%"|
 +
<syntaxhighlight lang="sparql">
 +
#defaultEndpoint:Lingualibre
 +
SELECT *
 +
WHERE {
 +
  ?id prop:P2 entity:Q2 .
 +
  ?id prop:P4 entity:Q930 .
 +
  SERVICE wikibase:label {
 +
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
 +
    ?id rdfs:label ?name .
 +
  }
 +
  BIND (regex(STR(?name),"-") AS ?has_fields_separator_in_written_word)
 +
}
 +
ORDER BY DESC (?has_fields_separator_in_written_word)
 +
</syntaxhighlight>
 +
||
 +
<query _pagination="5">
 +
#defaultEndpoint:Lingualibre
 +
SELECT *
 +
WHERE {
 +
  ?id prop:P2 entity:Q2 .
 +
  ?id prop:P4 entity:Q930 .
 +
  SERVICE wikibase:label {
 +
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
 +
    ?id rdfs:label ?name .
 +
  }
 +
  BIND (regex(STR(?name),"-") AS ?has_fields_separator_in_written_word)
 +
}
 +
ORDER BY DESC (?has_fields_separator_in_written_word)
 +
</query>
 +
|}
 +
 +
{| style="width:100%"
 +
|- style="vertical-align:top;"
 +
|style="padding: 0 3em;width:60%"|
 +
<syntaxhighlight lang="sparql">
 +
#defaultEndpoint:Lingualibre
 +
SELECT ?has_fields_separator_in_username (COUNT(?has_fields_separator_in_username) AS ?found)
 +
WHERE {
 +
  ?id prop:P2 entity:Q2 .
 +
  ?id prop:P4 entity:Q930 .
 +
  SERVICE wikibase:label {
 +
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
 +
    ?id rdfs:label ?name .
 +
  }
 +
  BIND (regex(STR(?name),"-") AS ?has_fields_separator_in_written_word)
 +
  # filter( regex(?name, "-" ))
 +
}
 +
#ORDER BY DESC (?has_fields_separator_in_written_word)
 +
GROUP BY (?has_fields_separator_in_written_word)
 +
</syntaxhighlight>
 +
||
 +
<query _pagination="5">
 +
#defaultEndpoint:Lingualibre
 +
SELECT ?has_fields_separator_in_username (COUNT(?has_fields_separator_in_username) AS ?found)
 +
WHERE {
 +
  ?id prop:P2 entity:Q2 .
 +
  ?id prop:P4 entity:Q930 .
 +
  SERVICE wikibase:label {
 +
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
 +
    ?id rdfs:label ?name .
 +
  }
 +
 
 +
  BIND (regex(STR(?name),"-") AS ?has_fields_separator_in_written_word)
 +
  # filter( regex(?name, "-" ))
 +
}
 +
#ORDER BY DESC (?has_fields_separator_in_written_word)
 +
GROUP BY (?has_fields_separator_in_written_word)
 
</query>
 
</query>
 
|}
 
|}

Revision as of 09:01, 29 September 2023


Speakers

✅ Speakers → Username containing reserved sign `-`

See also T297635 : Lingualibre filename separator should be mass migrated from - to , without breaking usernames nor words.
#defaultEndpoint:Lingualibre
SELECT *
WHERE {
  ?id prop:P2 entity:Q3 .
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
    ?id rdfs:label ?name .
  }
  BIND (regex(STR(?name),"-") AS ?has_fields_separator_in_username)
}
ORDER BY DESC (?has_fields_separator_in_username)
... Loading ...
#defaultEndpoint:Lingualibre
SELECT ?has_fields_separator_in_username (COUNT(?has_fields_separator_in_username) AS ?found)
WHERE {
  ?id prop:P2 entity:Q3 .
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
    ?id rdfs:label ?name .
  }
  
  BIND (regex(STR(?name),"-") AS ?has_fields_separator_in_username)
  # filter( regex(?name, "-" ))
}
#ORDER BY DESC (?has_fields_separator_in_username)
GROUP BY (?has_fields_separator_in_username)
... Loading ...


✅ Recordings → Gascon `Q930` recordings containing reserved sign `-`

See also T297635 : Lingualibre filename separator should be mass migrated from - to , without breaking usernames nor words.
#defaultEndpoint:Lingualibre
SELECT *
WHERE {
  ?id prop:P2 entity:Q2 .
  ?id prop:P4 entity:Q930 .
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
    ?id rdfs:label ?name .
  }
  BIND (regex(STR(?name),"-") AS ?has_fields_separator_in_written_word)
}
ORDER BY DESC (?has_fields_separator_in_written_word)
... Loading ...
#defaultEndpoint:Lingualibre
SELECT ?has_fields_separator_in_username (COUNT(?has_fields_separator_in_username) AS ?found)
WHERE {
  ?id prop:P2 entity:Q2 .
  ?id prop:P4 entity:Q930 .
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
    ?id rdfs:label ?name .
  }
  BIND (regex(STR(?name),"-") AS ?has_fields_separator_in_written_word)
  # filter( regex(?name, "-" ))
}
#ORDER BY DESC (?has_fields_separator_in_written_word)
GROUP BY (?has_fields_separator_in_written_word)
... Loading ...

✅ Speakers → Speakers of `zho` to change to `cmn`

Should be maintained to 0.
#defaultEndpoint:Lingualibre
SELECT ?langLabel ?isoLabel (SUBSTR(STR(?speaker),32) AS ?speaker) ?speakerLabel
WHERE {
  ?speaker prop:P2 entity:Q3 .   # Filter: speakers
  ?speaker prop:P4 entity:Q130 .   # Filter: P4 language is Q130 zho  ;
  ?speaker prop:P4 ?lang .
  ?lang  prop:P13 ?iso .
  # Add Labels
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . } 
} ORDER BY ASC(?isoLabel)
# Helper: Chinese writing zho:Q130/wd:Q7850 ; Chinese mandarin cmn:Q113/wd:Q9192 .
... Loading ...

✅ Speakers → Undefined place of residence

The following may need precision. If stale, it mean none was found.
#defaultEndpoint:Lingualibre
SELECT ?item ?itemLabel ?filledGenderLabel
WHERE {
  ?item prop:P2 entity:Q3 .   # Filter: is Q3 `speaker`
  ?item prop:P14  .   # Filter: missing P14 `place of residence`   <-------------------------------------
  # Add labels to each variable used.
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
  } 
}
... Loading ...

✅ Speakers → Non-documented genders values

This query returns speakers who may have willfully declined to document a gender. See also list of genders.
#defaultEndpoint:Lingualibre
SELECT ?item ?itemLabel ?filledGenderLabel
WHERE {
  ?item prop:P2 entity:Q3 .  
  { ?item prop:P8 entity:Q710460 . }    # Filter: 'gender' is 'not documented'
  ?item prop:P8 ?filledGender .
  # Add labels to each variable used.
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
  } 
}
... Loading ...

✅ Speakers → missing gender value

This query returns speakers who have missing gender value. See also list of genders.
#defaultEndpoint:Lingualibre
SELECT ?speaker ?speakerLabel ?gender ?languageLabel (COUNT(?audio) AS ?audio)
WHERE {
  ?speaker prop:P2 entity:Q3 .
  FILTER NOT EXISTS { ?speaker prop:P8 ?gender }
  ?audio prop:P5 ?speaker .   # Filter: P5 'speaker' is Q445757 'SangeetaRH‎'
  ?audio prop:P4 ?language .  # Filter: P4 'language' is Q34 'Marathi'
  # Add labels
  SERVICE wikibase:label {bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en"} 
}
GROUP BY ?speaker ?speakerLabel ?genderLabel ?languageLabel  # Sorting first groups per language and speaker
ORDER BY DESC (?audio)
... Loading ...

Recordings

✅ Recordings → With ISO-639-3 `zho` to change to `cmn`

#defaultEndpoint:Lingualibre
SELECT ?langLabel ?isoLabel (SUBSTR(STR(?audio),32) AS ?Audio) ?audioLabel
WHERE {
  ?audio prop:P4 entity:Q130 .   # Filter: P4 language is Q130 zho  ;
  ?audio prop:P2 entity:Q2 .   # Filter: audios
  ?audio prop:P4 ?lang .
  ?lang  prop:P13 ?iso .
  # Add Labels
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . } 
} ORDER BY ASC(?isoLabel)
# Helper: Chinese writing zho:Q130/wd:Q7850 ; Chinese mandarin cmn:Q113/wd:Q9192 .
... Loading ...


#defaultEndpoint:Lingualibre
SELECT ?isoLabel (COUNT(?iso) AS ?found)
WHERE {
  ?audio prop:P4 entity:Q130 .   # Filter: P4 language is Q130 zho  ;
  ?audio prop:P2 entity:Q2 .   # Filter: audios
  ?audio prop:P4 ?lang .
  ?lang  prop:P13 ?iso .
  # Add Labels
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . } 
} GROUP BY ?isoLabel
# Helper: Chinese writing zho:Q130/wd:Q7850 ; Chinese mandarin cmn:Q113/wd:Q9192
... Loading ...


Languages

✅ Languages → duplicata by Wikidata ID

Duplicated languages can be merged. If stale, it mean none was found.
#defaultEndpoint:Lingualibre
SELECT ?idWD (COUNT(?item) AS ?count) (GROUP_CONCAT(?item) AS ?items)
WHERE {
  ?item prop:P2 entity:Q4 ;
        prop:P12 ?idWD .
}
GROUP BY ?idWD
HAVING ( ?count > 1 )
... Loading ...

✅ Languages → list of values used including redirects

If some languageLabels display full uri, there is duplicata. Erroneous data should be corrected or deleted on both Commons and Lingualibre. (Q: which tool?)
#defaultEndpoint:Lingualibre
SELECT
(COUNT(DISTINCT ?record) AS ?records)
(COUNT(DISTINCT ?speaker) AS ?speakers)
?languageLabel
WHERE {
  ?record prop:P2 entity:Q2 .
  ?record prop:P5 ?speaker .
  ?record prop:P4 ?language .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }

}
GROUP BY ?languageLabel 
ORDER BY ?languageLabel
... Loading ...
✅ Recordings → With qid `Q170137` redirecting to `Q359` Mossi language
✅ Users linguistic profile has been corrected into Q359. Erroneous recordings language (P4) on Lingualibre, while Commons use correct Wikidata language Qid.
#defaultEndpoint:Lingualibre
SELECT ?lang 
?audio ?audioLabel 
(SUBSTR(STR(?speaker),32) AS ?SpeakerQid) ?speakerLabel 
?url (SUBSTR(STR(?urlLabel),52) AS ?filename)
WHERE {
  ?audio prop:P4 entity:Q170137 .   # Filter: P4 language is Q170137 (redirect) ;
  ?audio prop:P2 entity:Q2 .        # Filter: audios
  ?audio prop:P4 ?lang .
  ?audio prop:P5 ?speaker .
  ?audio prop:P3 ?url .
  # ?lang  prop:P13 ?iso .
  # Add Labels
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . } 
} ORDER BY ASC(?audio)
# Helper: .
... Loading ...
✅ Recordings → With qid `Q52071` redirecting to `Q73` Duala language
✅ Users linguistic profile has been corrected into Q73. Erroneous recordings language (P4) on Lingualibre, while Commons use correct Wikidata language Qid.
#defaultEndpoint:Lingualibre
SELECT ?lang (SUBSTR(STR(?audio),32) AS ?Audio) ?audioLabel (SUBSTR(STR(?speaker),32) AS ?SpeakerQid)  ?speakerLabel
WHERE {
  ?audio prop:P4 entity:Q52071 .   # Filter: P4 language is Q52071 (redirect) ;
  ?audio prop:P2 entity:Q2 .        # Filter: audios
  ?audio prop:P4 ?lang .
  ?audio prop:P5 ?speaker .
  # ?lang  prop:P13 ?iso .
  # Add Labels
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . } 
} ORDER BY ASC(?Audio)
# Helper: .
... Loading ...

✅ Languages → languages by type (if it exists)

Note: query to improve.
#defaultEndpoint:Lingualibre
SELECT ?language ?languageLabel ?code ?typeLabel
WHERE {
  ?language prop:P2 entity:Q4 .
  OPTIONAL { ?language prop:P13 ?code } .
  OPTIONAL { 
    ?language rdfs:label ?languageLabel .
    FILTER( lang(?languageLabel) = "en" ) } .
  OPTIONAL { ?language prop:P24 ?type } .
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
  } 
}
ORDER BY ?typeLabel
... Loading ...

✅ Languages → language with media type (P24) is video (Q88890)

#defaultEndpoint:Lingualibre
SELECT ?id ?idLabel WHERE { 
  ?id prop:P2 entity:Q4 . 
  ?id prop:P24 entity:Q88890 . 
  SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,en". } 
}
... Loading ...

✅ Languages with media type video → written word (P7), url (P3) speakers (P5)

#defaultEndpoint:Lingualibre
SELECT ?word ?filename ?speaker WHERE { 
  ?record prop:P4 entity:Q99628 . 
  ?record prop:P2 entity:Q2 . 
  ?record prop:P7 ?word . 
  ?record prop:P3 ?filename . 
  ?record prop:P5 ?speakerItem . 
  ?speakerItem rdfs:label 
  ?speaker filter ( lang( ?speaker ) = "en" ) . 
}
... Loading ...


✅ Languages iso from SignIt → relevant data

See Translatewiki:Translating:Lingua_Libre_SignIt.
# List from https://translatewiki.net/wiki/Translating:Lingua_Libre_SignIt#sortable:3=desc
# 2023.07.26
SELECT ?wdQid ?wikimediaCode ?i18nCode ?labelEN 
  (SAMPLE(?nativeLabel) as ?labelNative)
WHERE {
    # languages & scripts
    VALUES ?i18nCode { 
    "anp" "ar" "bn" "br" "de" "en" "es" "fa" "fi" "fr"
    "he" "hi" "ia" "id" "it" "ja" "kk-cyrl" "ko" "krc" "lmo"
    "mk" "ms" "nb" "pnb" "pt" "pt-br" "qqq" "ru" "scn" "sl"
    "sv" "sw" "tl" "tr" "uk" "zh-hans" "zh-hant" "mnw" "hu"
    "kk" "zh"
 }
    # Scripts, not languages
    VALUES ?i18nScript { "kk-cyrl" "zh-hans" "zh-hant" }
  
    { ?langId wdt:P218 ?i18nCode. } # has for iso 639-1 code
    UNION {?langId wdt:P219 ?i18nCode. } # has for iso 639-2 code
    UNION {?langId wdt:P220 ?i18nCode. } # has for iso 639-3 code
    UNION {?langId wdt:P305 ?i18nCode. } # has for isoIETF code
  
  #  UNION {?langId wdt:P31 wd:Q1149626 ; # has for instance written language
  #                 wdt:P424 ?i18nScript ; # has for wikimedia code
  #                 rdfs:label ?labelEN . }

    # UNION { ?langId wdt:P424 ?i18nCode. }  # has for wikimedia code

    OPTIONAL { ?langId wdt:P424 ?wikimediaCode. }
    OPTIONAL { ?langId wdt:P1705 ?native. }
            # wdt:P10369 ?lingualibreQid ;
    ?langId rdfs:label ?labelEN .
  FILTER ( lang(?labelEN) = "en" )
  BIND(REPLACE(STR(?langId), ".*/Q", "Q") AS ?wdQid)
  BIND(CONCAT(UCASE(SUBSTR(?native, 1 , 1 )), SUBSTR(?native, 2 )) AS ?nativeLabel)
}
GROUP BY ?wdQid ?wikimediaCode ?i18nCode ?labelEN
ORDER BY ASC(?i18nCode)
... Loading ...

Other

All properties with their Wikidata equivalent (if it exists)

#defaultEndpoint:Lingualibre
SELECT * WHERE {
  { ?propLili rdf:type owl:ObjectProperty }
  UNION
  { ?propLili rdf:type owl:DatatypeProperty }
  OPTIONAL { 
    ?propLiliClaim <http://wikiba.se/ontology#directClaim> ?propLili ;
       prop:P12 ?propWD
  }
}
... Loading ...

See also

Lingua Libre technical helps
Template {{Speakers category}} • {{Recommended lists}} • {{To iso 639-2}} • {{To iso 639-3}} • {{Userbox-records}} • {{Bot steps}}
Audio files How to create a frequency list?Convert files formatsDenoise files with SoXRename and mass rename
Bots Help:BotsLinguaLibre:BotHelp:Log in to Lingua Libre with PywikibotLingua Libre Bot (gh) • OlafbotPamputtBotDragons Bot (gh)
MediaWiki MediaWiki: Help:Documentation opérationelle MediawikiHelp:Database structureHelp:CSSHelp:RenameHelp:OAuthLinguaLibre:User rights (rate limit) • Module:Lingua Libre record & {{Lingua Libre record}}JS scripts: MediaWiki:Common.jsLastAudios.jsSoundLibrary.jsItemsSugar.jsLexemeQueriesGenerator.js (pad) • Sparql2data.js (pad) • LanguagesGallery.js (pad) • Gadgets: Gadget-LinguaImporter.jsGadget-Demo.jsGadget-RecentNonAudio.jsLiLiZip.js
Queries Help:APIsHelp:SPARQLSPARQL (intermediate) (stub) • SPARQL for lexemes (stub) • SPARQL for maintenanceLingualibre:Wikidata (stub) • Help:SPARQL (HAL)
Reuses Help:Download datasetsHelp:Embed audio in HTML
Unstable & tests Help:SPARQL/test
Categories Category:Technical reports